From e249b33b24817fda0c0296eec2d7445bd4ec3ab5 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Tue, 2 Dec 2025 19:44:59 +0100 Subject: [PATCH 01/17] delete code path -- everything broken --- .../async/hnsw/CompactStorageAdapter.java | 6 + .../apple/foundationdb/async/hnsw/Config.java | 71 +- .../apple/foundationdb/async/hnsw/HNSW.java | 613 +++++++++++++----- .../async/hnsw/InliningStorageAdapter.java | 9 + .../async/hnsw/NodeReferenceAndNode.java | 16 +- .../async/hnsw/StorageAdapter.java | 3 + .../foundationdb/async/hnsw/ConfigTest.java | 4 - .../foundationdb/async/hnsw/HNSWTest.java | 3 - .../record/metadata/IndexOptions.java | 7 - .../indexes/VectorIndexHelper.java | 4 - .../indexes/VectorIndexMaintainerFactory.java | 2 - .../foundationdb/indexes/VectorIndexTest.java | 5 - 12 files changed, 509 insertions(+), 234 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java index b03c296f67..5013d3a648 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java @@ -73,6 +73,12 @@ public CompactStorageAdapter(@Nonnull final Config config, super(config, nodeFactory, subspace, onWriteListener, onReadListener); } + @Nonnull + @Override + public Transformed getVector(@Nonnull final NodeReference nodeReference, @Nonnull final AbstractNode node) { + return node.asCompactNode().getVector(); + } + /** * Asynchronously fetches a node from the database for a given layer and primary key. *
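Deterministic seeding is no longer a per-index option after this change; as the HNSW.java hunks further below show, a node's top layer is now derived from its primary key. A Config is therefore assembled purely from the remaining builder knobs. The snippet below is a minimal, illustrative sketch (not part of the patch) that only uses builder methods exercised elsewhere in this series; the concrete values are placeholders.

    final Config config = HNSW.newConfigBuilder()
            .setMetric(Metric.COSINE_METRIC)   // distance metric
            .setUseInlining(false)             // compact (non-inlining) storage layout
            .setM(16)                          // target out-degree per node
            .setExtendCandidates(true)         // extend the candidate pool on insert
            .setUseRaBitQ(true)                // enable RaBitQ quantization
            .setRaBitQNumExBits(5)             // extra bits per dimension for RaBitQ
            .build(128);                       // number of vector dimensions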

diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java index efa2d3181b..82b945785d 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java @@ -32,7 +32,6 @@ */ @SuppressWarnings("checkstyle:MemberName") public final class Config { - public static final boolean DEFAULT_DETERMINISTIC_SEEDING = false; @Nonnull public static final Metric DEFAULT_METRIC = Metric.EUCLIDEAN_METRIC; public static final boolean DEFAULT_USE_INLINING = false; public static final int DEFAULT_M = 16; @@ -53,7 +52,6 @@ public final class Config { public static final int DEFAULT_MAX_NUM_CONCURRENT_NODE_FETCHES = 16; public static final int DEFAULT_MAX_NUM_CONCURRENT_NEIGHBOR_FETCHES = 16; - private final boolean deterministicSeeding; @Nonnull private final Metric metric; private final int numDimensions; @@ -72,12 +70,12 @@ public final class Config { private final int maxNumConcurrentNodeFetches; private final int maxNumConcurrentNeighborhoodFetches; - private Config(final boolean deterministicSeeding, @Nonnull final Metric metric, final int numDimensions, - final boolean useInlining, final int m, final int mMax, final int mMax0, - final int efConstruction, final boolean extendCandidates, final boolean keepPrunedConnections, - final double sampleVectorStatsProbability, final double maintainStatsProbability, - final int statsThreshold, final boolean useRaBitQ, final int raBitQNumExBits, - final int maxNumConcurrentNodeFetches, final int maxNumConcurrentNeighborhoodFetches) { + private Config(@Nonnull final Metric metric, final int numDimensions, final boolean useInlining, final int m, + final int mMax, final int mMax0, final int efConstruction, final boolean extendCandidates, + final boolean keepPrunedConnections, final double sampleVectorStatsProbability, + final double maintainStatsProbability, final int statsThreshold, final boolean useRaBitQ, + final int raBitQNumExBits, final int maxNumConcurrentNodeFetches, + final int maxNumConcurrentNeighborhoodFetches) { Preconditions.checkArgument(numDimensions >= 1, "numDimensions must be (1, MAX_INT]"); Preconditions.checkArgument(m >= 4 && m <= 200, "m must be [4, 200]"); Preconditions.checkArgument(mMax >= 4 && mMax <= 200, "mMax must be [4, 200]"); @@ -101,7 +99,6 @@ private Config(final boolean deterministicSeeding, @Nonnull final Metric metric, maxNumConcurrentNeighborhoodFetches <= 64, "maxNumConcurrentNeighborhoodFetches must be (0, 64]"); - this.deterministicSeeding = deterministicSeeding; this.metric = metric; this.numDimensions = numDimensions; this.useInlining = useInlining; @@ -120,15 +117,6 @@ private Config(final boolean deterministicSeeding, @Nonnull final Metric metric, this.maxNumConcurrentNeighborhoodFetches = maxNumConcurrentNeighborhoodFetches; } - /** - * Indicator that if {@code true} causes the insert logic of the HNSW to be seeded using a hash of the primary key - * of the record that is inserted. That can be useful for testing. If {@code isDeterministicSeeding} is - * {@code false}, we use {@link System#nanoTime()} for seeding. - */ - public boolean isDeterministicSeeding() { - return deterministicSeeding; - } - /** * The metric that is used to determine distances between vectors. 
*/ @@ -297,7 +285,7 @@ public int getMaxNumConcurrentNeighborhoodFetches() { @Nonnull public ConfigBuilder toBuilder() { - return new ConfigBuilder(isDeterministicSeeding(), getMetric(), isUseInlining(), getM(), getMMax(), getMMax0(), + return new ConfigBuilder(getMetric(), isUseInlining(), getM(), getMMax(), getMMax0(), getEfConstruction(), isExtendCandidates(), isKeepPrunedConnections(), getSampleVectorStatsProbability(), getMaintainStatsProbability(), getStatsThreshold(), isUseRaBitQ(), getRaBitQNumExBits(), getMaxNumConcurrentNodeFetches(), @@ -313,10 +301,9 @@ public boolean equals(final Object o) { return false; } final Config config = (Config)o; - return deterministicSeeding == config.deterministicSeeding && numDimensions == config.numDimensions && - useInlining == config.useInlining && m == config.m && mMax == config.mMax && mMax0 == config.mMax0 && - efConstruction == config.efConstruction && extendCandidates == config.extendCandidates && - keepPrunedConnections == config.keepPrunedConnections && + return numDimensions == config.numDimensions && useInlining == config.useInlining && m == config.m && + mMax == config.mMax && mMax0 == config.mMax0 && efConstruction == config.efConstruction && + extendCandidates == config.extendCandidates && keepPrunedConnections == config.keepPrunedConnections && Double.compare(sampleVectorStatsProbability, config.sampleVectorStatsProbability) == 0 && Double.compare(maintainStatsProbability, config.maintainStatsProbability) == 0 && statsThreshold == config.statsThreshold && useRaBitQ == config.useRaBitQ && @@ -327,17 +314,17 @@ public boolean equals(final Object o) { @Override public int hashCode() { - return Objects.hash(deterministicSeeding, metric, numDimensions, useInlining, m, mMax, mMax0, efConstruction, - extendCandidates, keepPrunedConnections, sampleVectorStatsProbability, maintainStatsProbability, - statsThreshold, useRaBitQ, raBitQNumExBits, maxNumConcurrentNodeFetches, maxNumConcurrentNeighborhoodFetches); + return Objects.hash(metric, numDimensions, useInlining, m, mMax, mMax0, efConstruction, extendCandidates, + keepPrunedConnections, sampleVectorStatsProbability, maintainStatsProbability, statsThreshold, + useRaBitQ, raBitQNumExBits, maxNumConcurrentNodeFetches, maxNumConcurrentNeighborhoodFetches); } @Override @Nonnull public String toString() { - return "Config[deterministicSeeding=" + isDeterministicSeeding() + ", metric=" + getMetric() + - ", numDimensions=" + getNumDimensions() + ", isUseInlining=" + isUseInlining() + ", M=" + getM() + - ", MMax=" + getMMax() + ", MMax0=" + getMMax0() + ", efConstruction=" + getEfConstruction() + + return "Config[" + ", metric=" + getMetric() + ", numDimensions=" + getNumDimensions() + + ", isUseInlining=" + isUseInlining() + ", M=" + getM() + ", MMax=" + getMMax() + + ", MMax0=" + getMMax0() + ", efConstruction=" + getEfConstruction() + ", isExtendCandidates=" + isExtendCandidates() + ", isKeepPrunedConnections=" + isKeepPrunedConnections() + ", sampleVectorStatsProbability=" + getSampleVectorStatsProbability() + @@ -356,7 +343,6 @@ public String toString() { @CanIgnoreReturnValue @SuppressWarnings("checkstyle:MemberName") public static class ConfigBuilder { - private boolean deterministicSeeding = DEFAULT_DETERMINISTIC_SEEDING; @Nonnull private Metric metric = DEFAULT_METRIC; private boolean useInlining = DEFAULT_USE_INLINING; @@ -380,13 +366,12 @@ public static class ConfigBuilder { public ConfigBuilder() { } - public ConfigBuilder(final boolean deterministicSeeding, @Nonnull final Metric 
metric, final boolean useInlining, - final int m, final int mMax, final int mMax0, final int efConstruction, - final boolean extendCandidates, final boolean keepPrunedConnections, - final double sampleVectorStatsProbability, final double maintainStatsProbability, - final int statsThreshold, final boolean useRaBitQ, final int raBitQNumExBits, - final int maxNumConcurrentNodeFetches, final int maxNumConcurrentNeighborhoodFetches) { - this.deterministicSeeding = deterministicSeeding; + public ConfigBuilder(@Nonnull final Metric metric, final boolean useInlining, final int m, final int mMax, + final int mMax0, final int efConstruction, final boolean extendCandidates, + final boolean keepPrunedConnections, final double sampleVectorStatsProbability, + final double maintainStatsProbability, final int statsThreshold, final boolean useRaBitQ, + final int raBitQNumExBits, final int maxNumConcurrentNodeFetches, + final int maxNumConcurrentNeighborhoodFetches) { this.metric = metric; this.useInlining = useInlining; this.m = m; @@ -404,16 +389,6 @@ public ConfigBuilder(final boolean deterministicSeeding, @Nonnull final Metric m this.maxNumConcurrentNeighborhoodFetches = maxNumConcurrentNeighborhoodFetches; } - public boolean isDeterministicSeeding() { - return deterministicSeeding; - } - - @Nonnull - public ConfigBuilder setDeterministicSeeding(final boolean deterministicSeeding) { - this.deterministicSeeding = deterministicSeeding; - return this; - } - @Nonnull public Metric getMetric() { return metric; @@ -564,7 +539,7 @@ public ConfigBuilder setMaxNumConcurrentNeighborhoodFetches(final int maxNumConc } public Config build(final int numDimensions) { - return new Config(isDeterministicSeeding(), getMetric(), numDimensions, isUseInlining(), getM(), getMMax(), + return new Config(getMetric(), numDimensions, isUseInlining(), getM(), getMMax(), getMMax0(), getEfConstruction(), isExtendCandidates(), isKeepPrunedConnections(), getSampleVectorStatsProbability(), getMaintainStatsProbability(), getStatsThreshold(), isUseRaBitQ(), getRaBitQNumExBits(), getMaxNumConcurrentNodeFetches(), diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 639cd2f273..2565d0e2ef 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -38,6 +38,8 @@ import com.apple.foundationdb.tuple.Tuple; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -63,7 +65,9 @@ import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; +import java.util.function.Predicate; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static com.apple.foundationdb.async.MoreAsyncUtil.forEach; import static com.apple.foundationdb.async.MoreAsyncUtil.forLoop; @@ -325,7 +329,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { @Nonnull private ImmutableList postProcessNearestNeighbors(@Nonnull final AffineOperator storageTransform, final int k, - @Nonnull final List> nearestNeighbors, + @Nonnull final List> nearestNeighbors, final boolean includeVectors) { final int lastIndex = 
Math.max(nearestNeighbors.size() - k, 0); @@ -335,7 +339,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { for (int i = nearestNeighbors.size() - 1; i >= lastIndex; i --) { final var nodeReferenceAndNode = nearestNeighbors.get(i); final var nodeReference = - Objects.requireNonNull(nodeReferenceAndNode).getNodeReferenceWithDistance(); + Objects.requireNonNull(nodeReferenceAndNode).getNodeReference(); final AbstractNode node = nodeReferenceAndNode.getNode(); @Nullable final RealVector reconstructedVector = includeVectors ? storageTransform.untransform(node.asCompactNode().getVector()) : null; @@ -379,7 +383,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, ImmutableList.of(nodeReference), layer, 1, Maps.newConcurrentMap(), queryVector) .thenApply(searchResult -> - Iterables.getOnlyElement(searchResult).getNodeReferenceWithDistance()); + Iterables.getOnlyElement(searchResult).getNodeReference()); } /** @@ -413,7 +417,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { * best candidate nodes found in this layer, paired with their full node data. */ @Nonnull - private CompletableFuture>> + private CompletableFuture>> searchLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, @@ -452,7 +456,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { .thenApply(candidateNode -> Iterables.filter(candidateNode.getNeighbors(), neighbor -> !visited.contains(Objects.requireNonNull(neighbor).getPrimaryKey()))) - .thenCompose(neighborReferences -> fetchNeighborhood(storageAdapter, readTransaction, + .thenCompose(neighborReferences -> fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, neighborReferences, nodeCache)) .thenApply(neighborReferences -> { for (final NodeReferenceWithVector current : neighborReferences) { @@ -484,9 +488,9 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { searchResult.stream() .map(nodeReferenceAndNode -> "(primaryKey=" + - nodeReferenceAndNode.getNodeReferenceWithDistance().getPrimaryKey() + + nodeReferenceAndNode.getNodeReference().getPrimaryKey() + ",distance=" + - nodeReferenceAndNode.getNodeReferenceWithDistance().getDistance() + ")") + nodeReferenceAndNode.getNodeReference().getDistance() + ")") .collect(Collectors.joining(","))); } return searchResult; @@ -605,12 +609,12 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { */ @Nonnull private CompletableFuture> - fetchNeighborhood(@Nonnull final StorageAdapter storageAdapter, - @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, - final int layer, - @Nonnull final Iterable neighborReferences, - @Nonnull final Map> nodeCache) { + fetchNeighborhoodReferences(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final AffineOperator storageTransform, + final int layer, + @Nonnull final Iterable neighborReferences, + @Nonnull final Map> nodeCache) { return fetchSomeNodesAndApply(storageAdapter, readTransaction, storageTransform, layer, neighborReferences, neighborReference -> { if (neighborReference.isNodeReferenceWithVector()) { @@ -659,12 +663,12 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { * objects, pairing each requested reference with its corresponding node. 
*/ @Nonnull - private CompletableFuture>> + private CompletableFuture>> fetchSomeNodesIfNotCached(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, final int layer, - @Nonnull final Iterable nodeReferences, + @Nonnull final Iterable nodeReferences, @Nonnull final Map> nodeCache) { return fetchSomeNodesAndApply(storageAdapter, readTransaction, storageTransform, layer, nodeReferences, nodeReference -> { @@ -726,13 +730,13 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { /** * Inserts a new vector with its associated primary key into the HNSW graph. *

- * The method first determines a random layer for the new node, called the {@code insertionLayer}. + * The method first determines a layer for the new node, called the {@code top layer}. * It then traverses the graph from the entry point downwards, greedily searching for the nearest * neighbors to the {@code newVector} at each layer. This search identifies the optimal * connection points for the new node. *

* Once the nearest neighbors are found, the new node is linked into the graph structure at all - * layers up to its {@code insertionLayer}. Special handling is included for inserting the + * layers up to its {@code top layer}. Special handling is included for inserting the * first-ever node into the graph or when a new node's layer is higher than any existing node, * which updates the graph's entry point. All operations are performed asynchronously. * @@ -746,7 +750,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { public CompletableFuture insert(@Nonnull final Transaction transaction, @Nonnull final Tuple newPrimaryKey, @Nonnull final RealVector newVector) { final SplittableRandom random = random(newPrimaryKey); - final int insertionLayer = insertionLayer(random); + final int insertionLayer = topLayer(newPrimaryKey); if (logger.isTraceEnabled()) { logger.trace("new node with key={} selected to be inserted into layer={}", newPrimaryKey, insertionLayer); } @@ -951,7 +955,7 @@ private AggregatedVector aggregateVectors(@Nonnull final Iterable * This method implements the second phase of the HNSW insertion algorithm. It begins at a starting layer, which is * the minimum of the graph's maximum layer ({@code lMax}) and the new node's randomly assigned - * {@code insertionLayer}. It then iterates downwards to layer 0. In each layer, it invokes + * {@code layer}. It then iterates downwards to layer 0. In each layer, it invokes * {@link #insertIntoLayer(StorageAdapter, Transaction, AffineOperator, Quantizer, List, int, Tuple, Transformed)} * to perform the search and connect the new node. The set of nearest neighbors found at layer {@code L} serves as * the entry points for the search at layer {@code L-1}. @@ -1051,8 +1055,11 @@ private CompletableFuture insertIntoLayers(@Nonnull final Transaction tran .thenCompose(searchResult -> { final List references = NodeReferenceAndNode.getReferences(searchResult); - return selectNeighbors(storageAdapter, transaction, storageTransform, estimator, searchResult, - layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVector) + return extendCandidatesIfNecessary(storageAdapter, transaction, storageTransform, estimator, + searchResult, layer, getConfig().isExtendCandidates(), nodeCache, newVector) + .thenCompose(extendedCandidates -> + selectNeighbors(storageAdapter, transaction, storageTransform, estimator, + extendedCandidates, layer, getConfig().getM(), nodeCache)) .thenCompose(selectedNeighbors -> { final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); @@ -1061,42 +1068,50 @@ layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVecto NodeReferenceAndNode.getReferences(selectedNeighbors)); final NeighborsChangeSet newNodeChangeSet = - new InsertNeighborsChangeSet<>(new BaseNeighborsChangeSet<>(ImmutableList.of()), + new InsertNeighborsChangeSet<>( + new BaseNeighborsChangeSet<>(ImmutableList.of()), newNode.getNeighbors()); - storageAdapter.writeNode(transaction, quantizer, newNode, layer, newNodeChangeSet); + storageAdapter.writeNode(transaction, quantizer, newNode, layer, + newNodeChangeSet); // create change sets for each selected neighbor and insert new node into them final Map> neighborChangeSetMap = Maps.newLinkedHashMap(); - for (final NodeReferenceAndNode selectedNeighbor : selectedNeighbors) { + for (final NodeReferenceAndNode selectedNeighbor : selectedNeighbors) { final NeighborsChangeSet baseSet = - new BaseNeighborsChangeSet<>(selectedNeighbor.getNode().getNeighbors()); 
+ new BaseNeighborsChangeSet<>( + selectedNeighbor.getNode().getNeighbors()); final NeighborsChangeSet insertSet = - new InsertNeighborsChangeSet<>(baseSet, ImmutableList.of(newNode.getSelfReference(newVector))); + new InsertNeighborsChangeSet<>(baseSet, + ImmutableList.of(newNode.getSelfReference(newVector))); neighborChangeSetMap.put(selectedNeighbor.getNode().getPrimaryKey(), insertSet); } - final int currentMMax = layer == 0 ? getConfig().getMMax0() : getConfig().getMMax(); + final int currentMMax = layer == 0 + ? getConfig().getMMax0() + : getConfig().getMMax(); + return forEach(selectedNeighbors, - selectedNeighbor -> { - final AbstractNode selectedNeighborNode = selectedNeighbor.getNode(); - final NeighborsChangeSet changeSet = - Objects.requireNonNull(neighborChangeSetMap.get(selectedNeighborNode.getPrimaryKey())); - return pruneNeighborsIfNecessary(storageAdapter, transaction, - storageTransform, estimator, selectedNeighbor, layer, - currentMMax, changeSet, nodeCache) - .thenApply(nodeReferencesAndNodes -> { - if (nodeReferencesAndNodes == null) { - return changeSet; - } - return resolveChangeSetFromNewNeighbors(changeSet, nodeReferencesAndNodes); - }); - }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), getExecutor()) + selectedNeighbor -> { + final AbstractNode selectedNeighborNode = selectedNeighbor.getNode(); + final NeighborsChangeSet changeSet = + Objects.requireNonNull(neighborChangeSetMap.get(selectedNeighborNode.getPrimaryKey())); + return pruneNeighborsIfNecessary(storageAdapter, transaction, + storageTransform, estimator, selectedNeighbor, layer, + currentMMax, changeSet, nodeCache) + .thenApply(nodeReferencesAndNodes -> { + if (nodeReferencesAndNodes == null) { + return changeSet; + } + return resolveChangeSetFromNewNeighbors(changeSet, nodeReferencesAndNodes); + }); + }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), getExecutor()) .thenApply(changeSets -> { for (int i = 0; i < selectedNeighbors.size(); i++) { - final NodeReferenceAndNode selectedNeighbor = selectedNeighbors.get(i); + final NodeReferenceAndNode selectedNeighbor = + selectedNeighbors.get(i); final NeighborsChangeSet changeSet = changeSets.get(i); storageAdapter.writeNode(transaction, quantizer, selectedNeighbor.getNode(), layer, changeSet); @@ -1134,15 +1149,15 @@ layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVecto */ private NeighborsChangeSet resolveChangeSetFromNewNeighbors(@Nonnull final NeighborsChangeSet beforeChangeSet, - @Nonnull final Iterable> afterNeighbors) { + @Nonnull final Iterable> afterNeighbors) { final Map beforeNeighborsMap = Maps.newLinkedHashMap(); for (final N n : beforeChangeSet.merge()) { beforeNeighborsMap.put(n.getPrimaryKey(), n); } final Map afterNeighborsMap = Maps.newLinkedHashMap(); - for (final NodeReferenceAndNode nodeReferenceAndNode : afterNeighbors) { - final NodeReferenceWithDistance nodeReferenceWithDistance = nodeReferenceAndNode.getNodeReferenceWithDistance(); + for (final NodeReferenceAndNode nodeReferenceAndNode : afterNeighbors) { + final NodeReferenceWithDistance nodeReferenceWithDistance = nodeReferenceAndNode.getNodeReference(); afterNeighborsMap.put(nodeReferenceWithDistance.getPrimaryKey(), nodeReferenceAndNode.getNode().getSelfReference(nodeReferenceWithDistance.getVector())); @@ -1200,12 +1215,12 @@ layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVecto * If no pruning was necessary, it completes with {@code null}. 
*/ @Nonnull - private CompletableFuture>> + private CompletableFuture>> pruneNeighborsIfNecessary(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Estimator estimator, - @Nonnull final NodeReferenceAndNode selectedNeighbor, + @Nonnull final NodeReferenceAndNode selectedNeighbor, final int layer, final int mMax, @Nonnull final NeighborsChangeSet neighborChangeSet, @@ -1220,27 +1235,25 @@ layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVecto logger.trace("pruning neighborhood of key={} which has numNeighbors={} out of mMax={}", selectedNeighborNode.getPrimaryKey(), numNeighbors, mMax); } - return fetchNeighborhood(storageAdapter, transaction, storageTransform, layer, neighborChangeSet.merge(), nodeCache) - .thenCompose(nodeReferenceWithVectors -> { + return fetchNeighborhoodReferences(storageAdapter, transaction, storageTransform, layer, neighborChangeSet.merge(), nodeCache) + .thenApply(nodeReferenceWithVectors -> { final ImmutableList.Builder nodeReferencesWithDistancesBuilder = ImmutableList.builder(); for (final NodeReferenceWithVector nodeReferenceWithVector : nodeReferenceWithVectors) { final var vector = nodeReferenceWithVector.getVector(); final double distance = estimator.distance(vector, - selectedNeighbor.getNodeReferenceWithDistance().getVector()); + selectedNeighbor.getNodeReference().getVector()); nodeReferencesWithDistancesBuilder.add( new NodeReferenceWithDistance(nodeReferenceWithVector.getPrimaryKey(), vector, distance)); } - return fetchSomeNodesIfNotCached(storageAdapter, transaction, storageTransform, layer, - nodeReferencesWithDistancesBuilder.build(), nodeCache); + return nodeReferencesWithDistancesBuilder.build(); }) .thenCompose(nodeReferencesAndNodes -> selectNeighbors(storageAdapter, transaction, storageTransform, estimator, nodeReferencesAndNodes, layer, - mMax, false, nodeCache, - selectedNeighbor.getNodeReferenceWithDistance().getVector())); + mMax, nodeCache)); } } @@ -1266,80 +1279,72 @@ layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVecto * @param estimator the estimator in use * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the * storage space that is currently being used - * @param nearestNeighbors the initial pool of candidate neighbors, typically from a search in a higher layer + * @param initialCandidates the initial pool of candidate neighbors, typically from a search in a higher layer * @param layer the layer in the HNSW graph where the selection is being performed * @param m the maximum number of neighbors to select - * @param isExtendCandidates a flag indicating whether to extend the initial candidate pool by fetching the * neighbors of the {@code nearestNeighbors} * @param nodeCache a cache of nodes to avoid redundant storage lookups - * @param vector the query vector for which neighbors are being selected * * @return a {@link CompletableFuture} which will complete with a list of the selected neighbors, * each represented as a {@link NodeReferenceAndNode} */ - private CompletableFuture>> + private CompletableFuture>> selectNeighbors(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Estimator estimator, - @Nonnull final Iterable> nearestNeighbors, + @Nonnull final Iterable initialCandidates, final int layer, final int m, - final 
boolean isExtendCandidates, - @Nonnull final Map> nodeCache, - @Nonnull final Transformed vector) { + @Nonnull final Map> nodeCache) { final Metric metric = getConfig().getMetric(); - return extendCandidatesIfNecessary(storageAdapter, readTransaction, storageTransform, estimator, - nearestNeighbors, layer, isExtendCandidates, nodeCache, vector) - .thenApply(extendedCandidates -> { - final List selected = Lists.newArrayListWithExpectedSize(m); - final Queue candidates = - new PriorityQueue<>(extendedCandidates.size(), - Comparator.comparing(NodeReferenceWithDistance::getDistance)); - candidates.addAll(extendedCandidates); - final Queue discardedCandidates = - getConfig().isKeepPrunedConnections() - ? new PriorityQueue<>(config.getM(), - Comparator.comparing(NodeReferenceWithDistance::getDistance)) - : null; - - while (!candidates.isEmpty() && selected.size() < m) { - final NodeReferenceWithDistance nearestCandidate = candidates.poll(); - boolean shouldSelect = true; - // if the metric does not support triangle inequality, we shold not use the heuristic - if (metric.satisfiesTriangleInequality()) { - for (final NodeReferenceWithDistance alreadySelected : selected) { - if (estimator.distance(nearestCandidate.getVector(), - alreadySelected.getVector()) < nearestCandidate.getDistance()) { - shouldSelect = false; - break; - } - } - } - if (shouldSelect) { - selected.add(nearestCandidate); - } else if (discardedCandidates != null) { - discardedCandidates.add(nearestCandidate); - } - } - if (discardedCandidates != null) { // isKeepPrunedConnections is set to true - while (!discardedCandidates.isEmpty() && selected.size() < m) { - selected.add(discardedCandidates.poll()); - } + final List selected = Lists.newArrayListWithExpectedSize(m); + final Queue candidates = + new PriorityQueue<>(getConfig().getM(), + Comparator.comparing(NodeReferenceWithDistance::getDistance)); + initialCandidates.forEach(candidates::add); + final Queue discardedCandidates = + getConfig().isKeepPrunedConnections() + ? 
new PriorityQueue<>(config.getM(), + Comparator.comparing(NodeReferenceWithDistance::getDistance)) + : null; + + while (!candidates.isEmpty() && selected.size() < m) { + final NodeReferenceWithDistance nearestCandidate = candidates.poll(); + boolean shouldSelect = true; + // if the metric does not support triangle inequality, we shold not use the heuristic + if (metric.satisfiesTriangleInequality()) { + for (final NodeReferenceWithDistance alreadySelected : selected) { + if (estimator.distance(nearestCandidate.getVector(), + alreadySelected.getVector()) < nearestCandidate.getDistance()) { + shouldSelect = false; + break; } + } + } + if (shouldSelect) { + selected.add(nearestCandidate); + } else if (discardedCandidates != null) { + discardedCandidates.add(nearestCandidate); + } + } - return ImmutableList.copyOf(selected); - }).thenCompose(selectedNeighbors -> - fetchSomeNodesIfNotCached(storageAdapter, readTransaction, storageTransform, layer, - selectedNeighbors, nodeCache)) + if (discardedCandidates != null) { // isKeepPrunedConnections is set to true + while (!discardedCandidates.isEmpty() && selected.size() < m) { + selected.add(discardedCandidates.poll()); + } + } + + return fetchSomeNodesIfNotCached(storageAdapter, readTransaction, storageTransform, layer, + selected, nodeCache) .thenApply(selectedNeighbors -> { if (logger.isTraceEnabled()) { logger.trace("selected neighbors={}", selectedNeighbors.stream() .map(selectedNeighbor -> - "(primaryKey=" + selectedNeighbor.getNodeReferenceWithDistance().getPrimaryKey() + - ",distance=" + selectedNeighbor.getNodeReferenceWithDistance().getDistance() + ")") + "(primaryKey=" + selectedNeighbor.getNodeReference().getPrimaryKey() + + ",distance=" + selectedNeighbor.getNodeReference().getDistance() + ")") .collect(Collectors.joining(","))); } return selectedNeighbors; @@ -1377,54 +1382,108 @@ layer, getConfig().getM(), getConfig().isExtendCandidates(), nodeCache, newVecto @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Estimator estimator, - @Nonnull final Iterable> candidates, - int layer, - boolean isExtendCandidates, + @Nonnull final Iterable> candidates, + final int layer, + final boolean isExtendCandidates, @Nonnull final Map> nodeCache, @Nonnull final Transformed vector) { - if (isExtendCandidates) { - final Set candidatesSeen = Sets.newConcurrentHashSet(); - for (final NodeReferenceAndNode candidate : candidates) { - candidatesSeen.add(candidate.getNode().getPrimaryKey()); - } - final ImmutableList.Builder neighborsOfCandidatesBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - for (final N neighbor : candidate.getNode().getNeighbors()) { - final Tuple neighborPrimaryKey = neighbor.getPrimaryKey(); - if (!candidatesSeen.contains(neighborPrimaryKey)) { - candidatesSeen.add(neighborPrimaryKey); - neighborsOfCandidatesBuilder.add(neighbor); - } - } - } - - final Iterable neighborsOfCandidates = neighborsOfCandidatesBuilder.build(); - - return fetchNeighborhood(storageAdapter, readTransaction, storageTransform, layer, - neighborsOfCandidates, nodeCache) - .thenApply(withVectors -> { - final ImmutableList.Builder extendedCandidatesBuilder = - ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - extendedCandidatesBuilder.add(candidate.getNodeReferenceWithDistance()); - } + // + // Add all given candidates to the result. 
+ // + final ImmutableList.Builder resultBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode candidate : candidates) { + resultBuilder.add(candidate.getNodeReference()); + } - for (final NodeReferenceWithVector withVector : withVectors) { - final double distance = estimator.distance(vector, withVector.getVector()); - extendedCandidatesBuilder.add(new NodeReferenceWithDistance(withVector.getPrimaryKey(), - withVector.getVector(), distance)); + if (isExtendCandidates) { + return neighbors(storageAdapter, readTransaction, storageTransform, candidates, + HopMode.INCLUSIVE, layer, nodeCache) + .thenApply(hop2 -> { + for (final NodeReferenceWithVector nodeReferenceWithVector : hop2) { + final double distance = estimator.distance(nodeReferenceWithVector.getVector(), vector); + resultBuilder.add(new NodeReferenceWithDistance(nodeReferenceWithVector.getPrimaryKey(), + nodeReferenceWithVector.getVector(), distance)); } - return extendedCandidatesBuilder.build(); + return resultBuilder.build(); }); } else { - final ImmutableList.Builder resultBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - resultBuilder.add(candidate.getNodeReferenceWithDistance()); + return CompletableFuture.completedFuture(resultBuilder.build()); + } + } + + /** + * Compute and if necessary fetch the neighbors of an iterable of initial nodes that is passed in. Hop is defined as the + * set of all nodes that are neighbors of the initial nodes. Note that the neighbor of an initial node might + * be another initial node. If that is the case the node is returned. If that is not desired by the caller, the + * caller needs to remove those nodes via a subtraction of the initial set. + * + * @param the type of the {@link NodeReference} + * @param storageAdapter the {@link StorageAdapter} used to access node data from storage + * @param readTransaction the active {@link ReadTransaction} for database access + * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the + * storage space that is currently being used + * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes, which have already been evaluated + * @param layer the graph layer from which to fetch nodes + * @param nodeCache a cache mapping primary keys to {@link AbstractNode} objects to avoid redundant fetches + * + * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReferenceWithDistance} + */ + private CompletableFuture> + neighbors(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Iterable> initialNodeReferenceAndNodes, + @Nonnull final HopMode hopMode, + final int layer, + @Nonnull + final Predicate samplingPredicate, + @Nonnull final Map> nodeCache) { + final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); + final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); + for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { + initialNodesMapBuilder.put(nodeReferenceAndNode.getNodeReference().getPrimaryKey(), + nodeReferenceAndNode); + if (hopMode == HopMode.INCLUSIVE) { + resultBuilder.add(nodeReferenceAndNode.getNodeReference()); } + } + final ImmutableMap> initialNodesMap = initialNodesMapBuilder.build(); - return CompletableFuture.completedFuture(resultBuilder.build()); + final Set nodeReferencesSeen = Sets.newHashSet(); + + 
final ImmutableList.Builder toBeFetchedBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { + for (final N neighbor : nodeReferenceAndNode.getNode().getNeighbors()) { + final Tuple neighborPrimaryKey = neighbor.getPrimaryKey(); + @Nullable final NodeReferenceAndNode initialNode = initialNodesMap.get(neighborPrimaryKey); + if (initialNode != null) { + // + // This is an initial node which happens to be a neighbor of another initial node. We already have + // everything we need to put this node into the result without fetching it. + // + if (hopMode != HopMode.EXCLUSIVE) { + resultBuilder.add(initialNode.getNodeReference()); + } + } else if (!nodeReferencesSeen.contains(neighborPrimaryKey)) { + // + // This is a node that is currently not known to us. It is not an initial node. We need to fetch it, + // and we need to mark it as seen so we won't consider it more than once. + // + + // TODO use the sampling predicate to determine if we want to fetch this neighbor or not + toBeFetchedBuilder.add(neighbor); + nodeReferencesSeen.add(neighborPrimaryKey); + } + } } + final Iterable toBeFetched = toBeFetchedBuilder.build(); + + return fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, toBeFetched, nodeCache) + .thenApply(withVectors -> { + resultBuilder.addAll(withVectors); + return resultBuilder.build(); + }); } /** @@ -1486,6 +1545,227 @@ private void writeLonelyNodeOnLayer(@Nonnull final Qua } } + /** + * Deletes a vector with its associated primary key from the HNSW graph. + *

+ * The method first determines a random layer for the new node, called the {@code top layer}. It then applies a + * deletion algorithm to all layers from {@code 0} to including the {@code top layer} that removes the record from + * the index and locally repairs the relationships between nearby other vectors that were affected by the delete + * operation. + * + * @param transaction the {@link Transaction} context for all database operations + * @param primaryKey the unique {@link Tuple} primary key for the new node being inserted + * + * @return a {@link CompletableFuture} that completes when the insertion operation is finished + */ + @Nonnull + public CompletableFuture delete(@Nonnull final Transaction transaction, @Nonnull final Tuple primaryKey) { + final int topLayer = topLayer(primaryKey); + if (logger.isTraceEnabled()) { + logger.trace("new node with key={} selected to be deleted form layer={}", primaryKey, topLayer); + } + + return StorageAdapter.fetchAccessInfo(getConfig(), transaction, getSubspace(), getOnReadListener()) + .thenCombine(exists(transaction, primaryKey), + (accessInfo, nodeAlreadyExists) -> { + if (!nodeAlreadyExists) { + if (logger.isDebugEnabled()) { + logger.debug("record does not exists in HNSW with key={} on layer={}", + primaryKey, topLayer); + } + } + return new AccessInfoAndNodeExistence(accessInfo, nodeAlreadyExists); + }) + .thenCompose(accessInfoAndNodeExistence -> { + if (!accessInfoAndNodeExistence.isNodeExists()) { + return AsyncUtil.DONE; + } + + final AccessInfo accessInfo = accessInfoAndNodeExistence.getAccessInfo(); + final AffineOperator storageTransform = storageTransform(accessInfo); + final Transformed transformedNewVector = storageTransform.transform(newVector); + final Quantizer quantizer = quantizer(accessInfo); + final Estimator estimator = quantizer.estimator(); + + final AccessInfo currentAccessInfo; + if (accessInfo == null) { + // this is the first node + writeLonelyNodes(quantizer, transaction, primaryKey, transformedNewVector, + topLayer, -1); + currentAccessInfo = new AccessInfo( + new EntryNodeReference(primaryKey, transformedNewVector, topLayer), + -1L, null); + StorageAdapter.writeAccessInfo(transaction, getSubspace(), currentAccessInfo, + getOnWriteListener()); + if (logger.isTraceEnabled()) { + logger.trace("written initial entry node reference with key={} on layer={}", + primaryKey, topLayer); + } + return AsyncUtil.DONE; + } else { + final EntryNodeReference entryNodeReference = accessInfo.getEntryNodeReference(); + final int lMax = entryNodeReference.getLayer(); + if (topLayer > lMax) { + writeLonelyNodes(quantizer, transaction, primaryKey, transformedNewVector, + topLayer, lMax); + currentAccessInfo = accessInfo.withNewEntryNodeReference( + new EntryNodeReference(primaryKey, transformedNewVector, + topLayer)); + StorageAdapter.writeAccessInfo(transaction, getSubspace(), currentAccessInfo, + getOnWriteListener()); + if (logger.isTraceEnabled()) { + logger.trace("written higher entry node reference with key={} on layer={}", + primaryKey, topLayer); + } + } else { + currentAccessInfo = accessInfo; + } + } + + final EntryNodeReference entryNodeReference = accessInfo.getEntryNodeReference(); + final int lMax = entryNodeReference.getLayer(); + if (logger.isTraceEnabled()) { + logger.trace("entry node read with key {} at layer {}", entryNodeReference.getPrimaryKey(), lMax); + } + + final NodeReferenceWithDistance initialNodeReference = + new NodeReferenceWithDistance(entryNodeReference.getPrimaryKey(), + 
entryNodeReference.getVector(), + estimator.distance(transformedNewVector, entryNodeReference.getVector())); + return forLoop(lMax, initialNodeReference, + layer -> layer > topLayer, + layer -> layer - 1, + (layer, previousNodeReference) -> { + final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); + return greedySearchLayer(storageAdapter, transaction, storageTransform, + estimator, previousNodeReference, layer, transformedNewVector); + }, executor) + .thenCompose(nodeReference -> + insertIntoLayers(transaction, storageTransform, quantizer, primaryKey, + transformedNewVector, nodeReference, lMax, topLayer)); + }).thenCompose(ignored -> AsyncUtil.DONE); + } + + /** + * Deletes a node from the HNSW graph across multiple layers, starting from a given top layer and entry point. + * + * @param transaction the transaction to use for database operations + * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the + * storage space that is currently being used + * @param quantizer the quantizer to be used for this insert + * @param primaryKey the primary key of the new node being inserted + * @param nodeReference the initial entry point for the search, typically the nearest neighbor found in the highest + * layer + * @param topLayer the top layer for the node. + * + * @return a {@link CompletableFuture} that completes when the new node has been successfully inserted into all + * its designated layers + */ + @Nonnull + private CompletableFuture deleteFromLayers(@Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + @Nonnull final Tuple primaryKey, + @Nonnull final NodeReferenceWithDistance nodeReference, + final int topLayer) { + if (logger.isTraceEnabled()) { + logger.trace("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, nodeReference.getPrimaryKey()); + } + + return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), + layer -> { + final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); + return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, layer, primaryKey); + }, + getConfig().getMaxNumConcurrentNeighborhoodFetches(), + executor).thenCompose(ignored -> AsyncUtil.DONE); + } + + /** + * Deletes a node from a specified layer of the HNSW graph. + *

+ * This method orchestrates the complete deletion process for a single layer. + *
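+ * It fetches the node being removed, gathers the surrounding first- and second-degree neighborhood, and then,
+ * for each former neighbor, re-selects up to {@code M} connections from that candidate pool so that the layer
+ * stays locally connected once the node is gone.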

+ * + * @param the type of the node reference, extending {@link NodeReference} + * @param storageAdapter the storage adapter for reading from and writing to the graph + * @param transaction the transaction context for the database operations + * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the + * storage space that is currently being used + * @param quantizer the quantizer for this insert + * @param layer the layer number to insert the new node into + * @param primaryKey the primary key of the new node to be inserted + * + * @return a {@code CompletableFuture} that completes with a {@code null} + */ + @Nonnull + private CompletableFuture + deleteFromLayer(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + final int layer, + @Nonnull final Tuple primaryKey) { + if (logger.isTraceEnabled()) { + logger.trace("begin delete key={} at layer={}", primaryKey, layer); + } + final Map> nodeCache = Maps.newConcurrentMap(); + final Estimator estimator = quantizer.estimator(); + + return storageAdapter.fetchNode(transaction, storageTransform, layer, primaryKey) + .thenCompose(toBeDeletedNode -> { + final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = + new NodeReferenceAndNode<>(new NodeReferenceWithVector(primaryKey, + toBeDeletedNode.asCompactNode().getVector()), toBeDeletedNode); + + return neighbors(storageAdapter, transaction, storageTransform, + ImmutableList.of(toBeDeletedNodeReferenceAndNode), HopMode.INCLUSIVE, layer, nodeCache) + .thenCompose(neighborsFirstDegree -> + fetchSomeNodesIfNotCached(storageAdapter, transaction, storageTransform, layer, + neighborsFirstDegree, nodeCache)) + .thenCompose(neighborsFirstDegree -> + neighbors(storageAdapter, transaction, storageTransform, neighborsFirstDegree, + HopMode.INCLUSIVE, layer, nodeCache)) + .thenCompose(neighborsSecondDegree -> + forEach(toBeDeletedNode.getNeighbors(), + neighborReference -> fetchNodeIfNotCached(storageAdapter, transaction, + storageTransform, layer, neighborReference, nodeCache) + .thenCompose(neighborNode -> { + final ImmutableSet.Builder candidatesBuilder = ImmutableSet.builder(); + for (final NodeReferenceWithVector nodeReferenceWithVector : neighborsSecondDegree) { + final double distance = + estimator.distance(nodeReferenceWithVector.getVector(), + storageAdapter.getVector(neighborReference, neighborNode)); + candidatesBuilder.add(new NodeReferenceWithDistance(nodeReferenceWithVector.getPrimaryKey(), + nodeReferenceWithVector.getVector(), distance)); + } + return repairOutNeighborNode(storageAdapter, transaction, + storageTransform, quantizer, candidatesBuilder.build(), layer, nodeCache); + }), + getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor)) + .thenCompose(ignored -> AsyncUtil.DONE); + }).thenApply(result -> { + if (logger.isTraceEnabled()) { + logger.trace("end delete key={} at layer={}", primaryKey, layer); + } + return result; + }); + } + + private CompletableFuture + repairOutNeighborNode(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + @Nonnull final Set candidates, + final int layer, + final Map> nodeCache) { + final Estimator estimator = quantizer.estimator(); + + return selectNeighbors(storageAdapter, transaction, storageTransform, estimator, candidates, + layer, 
getConfig().getM(), nodeCache); + } + /** * Scans all nodes within a given layer of the database. *

@@ -1543,15 +1823,11 @@ private StorageAdapter getStorageAdapterForLayer(final @Nonnull private SplittableRandom random(@Nonnull final Tuple primaryKey) { - if (config.isDeterministicSeeding()) { - return new SplittableRandom(primaryKey.hashCode()); - } else { - return new SplittableRandom(System.nanoTime()); - } + return new SplittableRandom(splitMixLong(primaryKey.hashCode())); } /** - * Calculates a random layer for a new element to be inserted. + * Calculates a layer for a new element to be inserted or for an element to be deleted from. *

* The layer is selected according to a logarithmic distribution, which ensures that * the probability of choosing a higher layer decreases exponentially. This is @@ -1559,12 +1835,12 @@ private SplittableRandom random(@Nonnull final Tuple primaryKey) { * is {@code floor(-ln(u) * lambda)}, where {@code u} is a uniform random * number and {@code lambda} is a normalization factor derived from a system * configuration parameter {@code M}. - * @param random a random to use - * @return a non-negative integer representing the randomly selected layer. + * @param primaryKey the primary key of the record to be inserted/updated/deleted + * @return a non-negative integer representing the randomly selected layer */ - private int insertionLayer(@Nonnull final SplittableRandom random) { + private int topLayer(@Nonnull final Tuple primaryKey) { double lambda = 1.0 / Math.log(getConfig().getM()); - double u = 1.0 - random.nextDouble(); // Avoid log(0) + double u = 1.0 - splitMixDouble(primaryKey.hashCode()); // Avoid log(0) return (int) Math.floor(-Math.log(u) * lambda); } @@ -1576,6 +1852,18 @@ private boolean shouldMaintainStats(@Nonnull final SplittableRandom random) { return random.nextDouble() < getConfig().getMaintainStatsProbability(); } + private static double splitMixDouble(final long x) { + return (splitMixLong(x) >>> 11) * 0x1.0p-53; + } + + private static long splitMixLong(long x) { + x += 0x9e3779b97f4a7c15L; + x = (x ^ (x >>> 30)) * 0xbf58476d1ce4e5b9L; + x = (x ^ (x >>> 27)) * 0x94d049bb133111ebL; + x = x ^ (x >>> 31); + return x; + } + @Nonnull private static List drain(@Nonnull Queue queue) { final ImmutableList.Builder resultBuilder = ImmutableList.builder(); @@ -1585,6 +1873,25 @@ private static List drain(@Nonnull Queue queue) { return resultBuilder.build(); } + /** + * Let {@code I} be the set of initial nodes for {@link #neighbors}. Let {@code H(I)} be the set of nodes that can be + * reached by traversing the neighbors of nodes in {@code I}. + */ + private enum HopMode { + /** + * Return {@code I union H(I)}. + */ + INCLUSIVE, + /** + * Return {@code H(I) \ I}. + */ + EXCLUSIVE, + /** + * Return {@code H(I)}. + */ + EXCLUSIVE_ALL; + } + private static class AccessInfoAndNodeExistence { @Nullable private final AccessInfo accessInfo; diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java index fce0fdac34..0f1516787d 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java @@ -34,6 +34,7 @@ import com.apple.foundationdb.subspace.Subspace; import com.apple.foundationdb.tuple.ByteArrayUtil; import com.apple.foundationdb.tuple.Tuple; +import com.google.common.base.Verify; import com.google.common.collect.ImmutableList; import javax.annotation.Nonnull; @@ -75,6 +76,14 @@ public InliningStorageAdapter(@Nonnull final Config config, super(config, nodeFactory, subspace, onWriteListener, onReadListener); } + @Nonnull + @Override + public Transformed getVector(@Nonnull final NodeReferenceWithVector nodeReference, + @Nonnull final AbstractNode node) { + Verify.verify(nodeReference.isNodeReferenceWithVector()); + return nodeReference.asNodeReferenceWithVector().getVector(); + } + /** * Asynchronously fetches a single node from a given layer by its primary key. *
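The two storage layouts resolve a neighbor's vector from different places: the compact adapter reads it off the fetched node, while the inlining adapter reads it off the vector-carrying neighbor reference (both overrides are added in this patch). Callers such as the new delete path can therefore stay layout-agnostic when computing distances. A hedged sketch of that call shape; storageAdapter, estimator, queryVector, neighborReference and neighborNode are assumed to already be in scope:

    // Sketch only: the adapter decides whether the vector lives on the node (compact layout)
    // or on the neighbor reference (inlining layout).
    final var neighborVector = storageAdapter.getVector(neighborReference, neighborNode);
    final double distance = estimator.distance(queryVector, neighborVector);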

diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java index a6c4f33abe..96c152fcc9 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java @@ -33,9 +33,9 @@ * pieces of information together. * @param the type of {@link NodeReference} used within the {@link AbstractNode} */ -class NodeReferenceAndNode { +class NodeReferenceAndNode { @Nonnull - private final NodeReferenceWithDistance nodeReferenceWithDistance; + private final T nodeReferenceWithDistance; @Nonnull private final AbstractNode node; @@ -46,7 +46,7 @@ class NodeReferenceAndNode { * {@code null}. * @param node the actual {@link AbstractNode} object that the reference points to. Must not be {@code null}. */ - public NodeReferenceAndNode(@Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, + public NodeReferenceAndNode(@Nonnull final T nodeReferenceWithDistance, @Nonnull final AbstractNode node) { this.nodeReferenceWithDistance = nodeReferenceWithDistance; this.node = node; @@ -57,7 +57,7 @@ public NodeReferenceAndNode(@Nonnull final NodeReferenceWithDistance nodeReferen * @return the non-null {@link NodeReferenceWithDistance} object. */ @Nonnull - public NodeReferenceWithDistance getNodeReferenceWithDistance() { + public T getNodeReference() { return nodeReferenceWithDistance; } @@ -77,10 +77,10 @@ public AbstractNode getNode() { * @return a {@link List} of {@link NodeReferenceAndNode}s */ @Nonnull - public static List getReferences(@Nonnull List> referencesAndNodes) { - final ImmutableList.Builder referencesBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode referenceWithNode : referencesAndNodes) { - referencesBuilder.add(referenceWithNode.getNodeReferenceWithDistance()); + public static List getReferences(@Nonnull List> referencesAndNodes) { + final ImmutableList.Builder referencesBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode referenceWithNode : referencesAndNodes) { + referencesBuilder.add(referenceWithNode.getNodeReference()); } return referencesBuilder.build(); } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java index 4c7296ad45..de33e1c1be 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java @@ -124,6 +124,9 @@ interface StorageAdapter { @Nonnull OnReadListener getOnReadListener(); + @Nonnull + Transformed getVector(@Nonnull N nodeReference, @Nonnull AbstractNode node); + /** * Asynchronously fetches a node from a specific layer, identified by its primary key. *
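The ConfigTest and HNSWTest changes below follow from the same simplification: because the top layer of a node is now a pure function of its primary key, there is no seeding mode left to configure or to pin in tests. The layer computation added in HNSW.java hashes the key through a splitmix-style finalizer and maps the result onto the usual exponential layer distribution, so insert and delete independently arrive at the same layer for a given key. A self-contained, illustrative rendering of that computation (the method names here are not from the patch):

    // Illustrative sketch of the deterministic layer selection; mirrors splitMixLong/splitMixDouble/topLayer.
    static long splitMix64(long x) {
        x += 0x9e3779b97f4a7c15L;
        x = (x ^ (x >>> 30)) * 0xbf58476d1ce4e5b9L;
        x = (x ^ (x >>> 27)) * 0x94d049bb133111ebL;
        return x ^ (x >>> 31);
    }

    static int topLayerFor(final int primaryKeyHash, final int m) {
        final double lambda = 1.0 / Math.log(m);                                  // normalization factor from M
        final double u = 1.0 - (splitMix64(primaryKeyHash) >>> 11) * 0x1.0p-53;   // uniform in (0, 1], avoids log(0)
        return (int) Math.floor(-Math.log(u) * lambda);                           // exponential layer distribution
    }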

diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java index 141df55cfe..f6319dae14 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java @@ -32,7 +32,6 @@ void testConfig() { Assertions.assertThat(HNSW.newConfigBuilder().build(768)).isEqualTo(defaultConfig); Assertions.assertThat(defaultConfig.toBuilder().build(768)).isEqualTo(defaultConfig); - final boolean deterministicSeeding = true; final Metric metric = Metric.COSINE_METRIC; final boolean useInlining = true; final int m = Config.DEFAULT_M + 1; @@ -51,7 +50,6 @@ void testConfig() { final int maxNumConcurrentNodeFetches = 1; final int maxNumConcurrentNeighborhoodFetches = 2; - Assertions.assertThat(defaultConfig.isDeterministicSeeding()).isNotEqualTo(deterministicSeeding); Assertions.assertThat(defaultConfig.getMetric()).isNotSameAs(metric); Assertions.assertThat(defaultConfig.isUseInlining()).isNotEqualTo(useInlining); Assertions.assertThat(defaultConfig.getM()).isNotEqualTo(m); @@ -73,7 +71,6 @@ void testConfig() { final Config newConfig = defaultConfig.toBuilder() - .setDeterministicSeeding(deterministicSeeding) .setMetric(metric) .setUseInlining(useInlining) .setM(m) @@ -91,7 +88,6 @@ void testConfig() { .setMaxNumConcurrentNeighborhoodFetches(maxNumConcurrentNeighborhoodFetches) .build(768); - Assertions.assertThat(newConfig.isDeterministicSeeding()).isEqualTo(deterministicSeeding); Assertions.assertThat(newConfig.getMetric()).isSameAs(metric); Assertions.assertThat(newConfig.isUseInlining()).isEqualTo(useInlining); Assertions.assertThat(newConfig.getM()).isEqualTo(m); diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 726a38902f..07538f8e20 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -210,7 +210,6 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e final int numDimensions = 128; final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), HNSW.newConfigBuilder() - .setDeterministicSeeding(true) .setMetric(metric) .setUseInlining(useInlining) .setExtendCandidates(extendCandidates) @@ -314,7 +313,6 @@ void testBasicInsertWithRaBitQEncodings(final long seed) { final int numDimensions = 128; final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), HNSW.newConfigBuilder() - .setDeterministicSeeding(true) .setMetric(metric) .setUseRaBitQ(true) .setRaBitQNumExBits(5) @@ -436,7 +434,6 @@ void testSIFTInsertSmall() throws Exception { final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), HNSW.newConfigBuilder() - .setDeterministicSeeding(false) .setUseRaBitQ(true) .setRaBitQNumExBits(5) .setMetric(metric) diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java index a393debff4..950f440f96 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java +++ 
b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java @@ -224,13 +224,6 @@ public class IndexOptions { */ public static final String RTREE_USE_NODE_SLOT_INDEX = "rtreeUseNodeSlotIndex"; - /** - * HNSW-only: The seeding method that is used to see the PRNG that is in turn used to probabilistically determine - * the highest layer of an insert into an HNSW structure. See {@link Config#isDeterministicSeeding()}. The default - * random seed is {@link Config#DEFAULT_DETERMINISTIC_SEEDING}. - */ - public static final String HNSW_DETERMINISTIC_SEEDING = "hnswDeterministicSeeding"; - /** * HNSW-only: The metric that is used to determine distances between vectors. The default metric is * {@link Config#DEFAULT_METRIC}. See {@link Config#getMetric()}. diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java index a0be8a86c0..845acb9e9f 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java @@ -49,10 +49,6 @@ private VectorIndexHelper() { @Nonnull public static Config getConfig(@Nonnull final Index index) { final ConfigBuilder builder = HNSW.newConfigBuilder(); - final String hnswRandomSeedOption = index.getOption(IndexOptions.HNSW_DETERMINISTIC_SEEDING); - if (hnswRandomSeedOption != null) { - builder.setDeterministicSeeding(Boolean.parseBoolean(hnswRandomSeedOption)); - } final String hnswMetricOption = index.getOption(IndexOptions.HNSW_METRIC); if (hnswMetricOption != null) { builder.setMetric(Metric.valueOf(hnswMetricOption)); diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java index 5e164ba943..c5f66c381d 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java @@ -137,8 +137,6 @@ public void validateChangedOptions(@Nonnull final Index oldIndex, final Config newOptions = VectorIndexHelper.getConfig(index); // do not allow changing any of the following - disallowChange(changedOptions, IndexOptions.HNSW_DETERMINISTIC_SEEDING, - oldOptions, newOptions, Config::isDeterministicSeeding); disallowChange(changedOptions, IndexOptions.HNSW_METRIC, oldOptions, newOptions, Config::getMetric); disallowChange(changedOptions, IndexOptions.HNSW_NUM_DIMENSIONS, diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java index 65132ad37c..d75bd8d0c1 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java @@ -304,7 +304,6 @@ void directIndexValidatorTest() throws 
Exception { validateIndexEvolution(metaDataValidator, index, ImmutableMap.builder() // cannot change those per se but must accept same value - .put(IndexOptions.HNSW_DETERMINISTIC_SEEDING, "false") .put(IndexOptions.HNSW_METRIC, Metric.EUCLIDEAN_METRIC.name()) .put(IndexOptions.HNSW_NUM_DIMENSIONS, "128") .put(IndexOptions.HNSW_USE_INLINING, "false") @@ -324,10 +323,6 @@ void directIndexValidatorTest() throws Exception { .put(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NODE_FETCHES, "17") .put(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NEIGHBORHOOD_FETCHES, "9").build()); - Assertions.assertThatThrownBy(() -> validateIndexEvolution(metaDataValidator, index, - ImmutableMap.of(IndexOptions.HNSW_NUM_DIMENSIONS, "128", - IndexOptions.HNSW_DETERMINISTIC_SEEDING, "true"))).isInstanceOf(MetaDataException.class); - Assertions.assertThatThrownBy(() -> validateIndexEvolution(metaDataValidator, index, ImmutableMap.of(IndexOptions.HNSW_NUM_DIMENSIONS, "128", IndexOptions.HNSW_METRIC, Metric.EUCLIDEAN_SQUARE_METRIC.name()))) From 717b01e28158a05530ee592a331e6fe2e3049e91 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Thu, 4 Dec 2025 18:07:31 +0100 Subject: [PATCH 02/17] delete code path -- delete from layer is almost code complete --- .../async/hnsw/AbstractStorageAdapter.java | 20 +- .../async/hnsw/CompactStorageAdapter.java | 35 +- .../apple/foundationdb/async/hnsw/HNSW.java | 443 +++++++++++++----- .../async/hnsw/InliningStorageAdapter.java | 12 +- .../async/hnsw/OnWriteListener.java | 13 + .../async/hnsw/StorageAdapter.java | 19 +- .../foundationdb/async/hnsw/HNSWTest.java | 2 +- 7 files changed, 418 insertions(+), 126 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java index 84e7db99ab..3b1952373d 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java @@ -200,16 +200,16 @@ private > T checkNode(@Nullable final T node) { * * @param transaction the non-null {@link Transaction} context for this write operation * @param quantizer the quantizer to use - * @param node the non-null {@link Node} to be written to storage * @param layer the layer index where the node is being written + * @param node the non-null {@link Node} to be written to storage * @param changeSet the non-null {@link NeighborsChangeSet} detailing the modifications * to the node's neighbors */ @Override public void writeNode(@Nonnull final Transaction transaction, @Nonnull final Quantizer quantizer, - @Nonnull final AbstractNode node, final int layer, + final int layer, @Nonnull final AbstractNode node, @Nonnull final NeighborsChangeSet changeSet) { - writeNodeInternal(transaction, quantizer, node, layer, changeSet); + writeNodeInternal(transaction, quantizer, layer, node, changeSet); if (logger.isTraceEnabled()) { logger.trace("written node with key={} at layer={}", node.getPrimaryKey(), layer); } @@ -225,12 +225,22 @@ public void writeNode(@Nonnull final Transaction transaction, @Nonnull final Qua * * @param transaction the non-null transaction context for the write operation * @param quantizer the quantizer to use - * @param node the non-null {@link Node} to write * @param layer the layer or level of the node in the structure + * @param node the non-null {@link Node} to write * @param changeSet the non-null {@link NeighborsChangeSet} detailing 
additions or * removals of neighbor links */ protected abstract void writeNodeInternal(@Nonnull Transaction transaction, @Nonnull Quantizer quantizer, - @Nonnull AbstractNode node, int layer, + int layer, @Nonnull AbstractNode node, @Nonnull NeighborsChangeSet changeSet); + + @Override + public void deleteNode(@Nonnull final Transaction transaction, final int layer, @Nonnull final Tuple primaryKey) { + deleteNodeInternal(transaction, layer, primaryKey); + if (logger.isTraceEnabled()) { + logger.trace("deleted node with key={} at layer={}", primaryKey, layer); + } + } + + protected abstract void deleteNodeInternal(@Nonnull final Transaction transaction, final int layer, @Nonnull final Tuple primaryKey); } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java index 5013d3a648..aa4012158d 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java @@ -103,8 +103,7 @@ protected CompletableFuture> fetchNodeInternal(@Nonn @Nonnull final AffineOperator storageTransform, final int layer, @Nonnull final Tuple primaryKey) { - final byte[] keyBytes = getDataSubspace().pack(Tuple.from(layer, primaryKey)); - + final byte[] keyBytes = getNodeKey(layer, primaryKey); return readTransaction.get(keyBytes) .thenApply(valueBytes -> { if (valueBytes == null) { @@ -222,16 +221,16 @@ private AbstractNode compactNodeFromTuples(@Nonnull final AffineO * * @param transaction the {@link Transaction} to use for the write operation. * @param quantizer the quantizer to use - * @param node the {@link AbstractNode} to be serialized and written; it is processed as a {@link CompactNode}. * @param layer the graph layer index for the node, used to construct the storage key. + * @param node the {@link AbstractNode} to be serialized and written; it is processed as a {@link CompactNode}. * @param neighborsChangeSet a {@link NeighborsChangeSet} containing the additions and removals, which are * merged to determine the final set of neighbors to be written. */ @Override public void writeNodeInternal(@Nonnull final Transaction transaction, @Nonnull final Quantizer quantizer, - @Nonnull final AbstractNode node, final int layer, + final int layer, @Nonnull final AbstractNode node, @Nonnull final NeighborsChangeSet neighborsChangeSet) { - final byte[] key = getDataSubspace().pack(Tuple.from(layer, node.getPrimaryKey())); + final byte[] key = getNodeKey(layer, node.getPrimaryKey()); final List nodeItems = Lists.newArrayListWithExpectedSize(3); nodeItems.add(NodeKind.COMPACT.getSerialized()); @@ -260,6 +259,32 @@ public void writeNodeInternal(@Nonnull final Transaction transaction, @Nonnull f } } + @Override + protected void deleteNodeInternal(@Nonnull final Transaction transaction, final int layer, + @Nonnull final Tuple primaryKey) { + final byte[] key = getNodeKey(layer, primaryKey); + transaction.clear(key); + getOnWriteListener().onNodeDeleted(layer, primaryKey); + } + + /** + * Constructs the raw database key for a node based on its layer and primary key. + *

+ * This key is created by packing a tuple containing the specified {@code layer} and the node's {@code primaryKey} + * within the data subspace. The resulting byte array is suitable for use in direct database lookups and preserves + * the sort order of the components. + * + * @param layer the layer index where the node resides + * @param primaryKey the primary key that uniquely identifies the node within its layer, + * encapsulated in a {@link Tuple} + * + * @return a byte array representing the packed key for the specified node + */ + @Nonnull + private byte[] getNodeKey(final int layer, @Nonnull final Tuple primaryKey) { + return getDataSubspace().pack(Tuple.from(layer, primaryKey)); + } + /** * Scans a given layer for nodes, returning an iterable over the results. *

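The getNodeKey helper introduced above is the single place where the compact on-disk key is formed: the layer index and the node's primary key are packed into the adapter's data subspace, so deleting a compact node is a single-key clear. A rough sketch using the FoundationDB tuple layer; the subspace prefix and the concrete key values are made up for illustration:

    import com.apple.foundationdb.subspace.Subspace;
    import com.apple.foundationdb.tuple.Tuple;

    // Hypothetical stand-in for the adapter's data subspace.
    final Subspace dataSubspace = new Subspace(Tuple.from("hnsw-data"));

    // Mirrors getNodeKey(layer, primaryKey): pack the layer index and the primary-key tuple.
    final byte[] nodeKey = dataSubspace.pack(Tuple.from(0, Tuple.from(42L)));

    // In the compact layout the node is one key-value pair, so a delete is simply:
    // transaction.clear(nodeKey);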
diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 2565d0e2ef..bb7a7a48a5 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -65,7 +65,6 @@ import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; -import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -497,6 +496,27 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { }); } + /** + * Gets a node from the cache or throws an exception. + * + * @param the type of the node reference, which must extend {@link NodeReference} + * @param primaryKey the {@link Tuple} representing the primary key of the node + * @param nodeCache the cache to check for the node + * + * @return the cached {@link AbstractNode} associated with the given primary key + * @throws IllegalStateException if the node is not already present in the cache + */ + @Nonnull + private AbstractNode + nodeFromCache(@Nonnull final Tuple primaryKey, + @Nonnull final Map> nodeCache) { + final AbstractNode nodeFromCache = nodeCache.get(primaryKey); + if (nodeFromCache == null) { + throw new IllegalStateException("node should already have been fetched: " + primaryKey); + } + return nodeFromCache; + } + /** * Asynchronously fetches a node if it is not already present in the cache. *

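nodeFromCache spells out a convention that the insert and delete paths both rely on: each operation keeps a per-call concurrent map of already-fetched nodes, the fetch helpers populate it, and later stages that only write nodes back may assume the entry exists. A short sketch of that contract, with variable names chosen only for illustration:

    // Fetch helpers record every node they load ...
    nodeCache.put(nodeReference.getPrimaryKey(), fetchedNode);

    // ... so a later write-back step can resolve it without another read;
    // a missing entry indicates a logic error and fails fast.
    final var cachedNode = nodeFromCache(nodeReference.getPrimaryKey(), nodeCache);
    storageAdapter.writeNode(transaction, quantizer, layer, cachedNode, changeSet);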
@@ -629,8 +649,9 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { }, (neighborReference, neighborNode) -> { // - // At this point we know that the node needed to be fetched which excludes INLINING nodes - // as they never have to be fetched. Therefore, we can safely treat the nodes as compact nodes. + // At this point we know that the node needed to be fetched, which means this branch cannot be + // reached for INLINING nodes as they never have to be fetched. Therefore, we can safely treat + // the nodes as compact nodes. // nodeCache.put(neighborReference.getPrimaryKey(), neighborNode); return new NodeReferenceWithVector(neighborReference.getPrimaryKey(), @@ -678,9 +699,9 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { } return new NodeReferenceAndNode<>(nodeReference, node); }, - (nodeReferenceWithDistance, node) -> { - nodeCache.put(nodeReferenceWithDistance.getPrimaryKey(), node); - return new NodeReferenceAndNode<>(nodeReferenceWithDistance, node); + (nodeReference, node) -> { + nodeCache.put(nodeReference.getPrimaryKey(), node); + return new NodeReferenceAndNode<>(nodeReference, node); }); } @@ -1058,7 +1079,7 @@ private CompletableFuture insertIntoLayers(@Nonnull final Transaction tran return extendCandidatesIfNecessary(storageAdapter, transaction, storageTransform, estimator, searchResult, layer, getConfig().isExtendCandidates(), nodeCache, newVector) .thenCompose(extendedCandidates -> - selectNeighbors(storageAdapter, transaction, storageTransform, estimator, + selectCandidates(storageAdapter, transaction, storageTransform, estimator, extendedCandidates, layer, getConfig().getM(), nodeCache)) .thenCompose(selectedNeighbors -> { final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); @@ -1072,7 +1093,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) new BaseNeighborsChangeSet<>(ImmutableList.of()), newNode.getNeighbors()); - storageAdapter.writeNode(transaction, quantizer, newNode, layer, + storageAdapter.writeNode(transaction, quantizer, layer, newNode, newNodeChangeSet); // create change sets for each selected neighbor and insert new node into them @@ -1089,17 +1110,18 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) insertSet); } - final int currentMMax = layer == 0 - ? getConfig().getMMax0() - : getConfig().getMMax(); + final int currentMMax = + layer == 0 ? 
getConfig().getMMax0() : getConfig().getMMax(); return forEach(selectedNeighbors, selectedNeighbor -> { + final NodeReferenceWithDistance selectedNeighborReference = + selectedNeighbor.getNodeReference(); final AbstractNode selectedNeighborNode = selectedNeighbor.getNode(); final NeighborsChangeSet changeSet = Objects.requireNonNull(neighborChangeSetMap.get(selectedNeighborNode.getPrimaryKey())); return pruneNeighborsIfNecessary(storageAdapter, transaction, - storageTransform, estimator, selectedNeighbor, layer, + storageTransform, estimator, layer, selectedNeighborReference, currentMMax, changeSet, nodeCache) .thenApply(nodeReferencesAndNodes -> { if (nodeReferencesAndNodes == null) { @@ -1114,7 +1136,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) selectedNeighbors.get(i); final NeighborsChangeSet changeSet = changeSets.get(i); storageAdapter.writeNode(transaction, quantizer, - selectedNeighbor.getNode(), layer, changeSet); + layer, selectedNeighbor.getNode(), changeSet); } return ImmutableList.copyOf(references); }); @@ -1204,11 +1226,11 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * @param estimator an estimator to estimate distances * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the * storage space that is currently being used - * @param selectedNeighbor the node whose neighborhood is being considered for pruning + * @param nodeReferenceWithVector the node reference of the node whose neighborhood is being considered for pruning * @param layer the graph layer on which the operation is performed * @param mMax the maximum number of neighbors a node is allowed to have on this layer * @param neighborChangeSet a set of pending changes to the neighborhood that must be included in the pruning - * calculation + * calculation * @param nodeCache a cache of nodes to avoid redundant database fetches * * @return a {@link CompletableFuture} which completes with a list of the newly selected neighbors for the pruned node. 
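Pruning keeps every neighborhood within the per-layer cap, mMax0 on layer 0 and mMax elsewhere; for example, with a cap of 16, a neighbor that ends up with 17 links after an insert has its list re-selected down to at most 16 through selectCandidates. The guard itself, paraphrased from the hunk that follows, with the layer and the neighbor change set assumed to be in scope:

    // The cap depends on the layer; layer 0 is allowed a larger neighborhood.
    final int mMax = layer == 0 ? getConfig().getMMax0() : getConfig().getMMax();

    // The merged change set is a view over existing neighbors plus pending edits.
    final int numNeighbors = Iterables.size(neighborChangeSet.merge());
    if (numNeighbors >= mMax) {
        // re-rank the merged neighbors and keep at most mMax of them via selectCandidates(...)
    }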
@@ -1220,12 +1242,11 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) @Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Estimator estimator, - @Nonnull final NodeReferenceAndNode selectedNeighbor, final int layer, + @Nonnull final NodeReferenceWithVector nodeReferenceWithVector, final int mMax, @Nonnull final NeighborsChangeSet neighborChangeSet, @Nonnull final Map> nodeCache) { - final AbstractNode selectedNeighborNode = selectedNeighbor.getNode(); final int numNeighbors = Iterables.size(neighborChangeSet.merge()); // this is a view over the iterable neighbors in the set if (numNeighbors < mMax) { @@ -1233,25 +1254,23 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) } else { if (logger.isTraceEnabled()) { logger.trace("pruning neighborhood of key={} which has numNeighbors={} out of mMax={}", - selectedNeighborNode.getPrimaryKey(), numNeighbors, mMax); + nodeReferenceWithVector.getPrimaryKey(), numNeighbors, mMax); } return fetchNeighborhoodReferences(storageAdapter, transaction, storageTransform, layer, neighborChangeSet.merge(), nodeCache) - .thenApply(nodeReferenceWithVectors -> { + .thenApply(neighborReferenceWithVectors -> { final ImmutableList.Builder nodeReferencesWithDistancesBuilder = ImmutableList.builder(); - for (final NodeReferenceWithVector nodeReferenceWithVector : nodeReferenceWithVectors) { - final var vector = nodeReferenceWithVector.getVector(); - final double distance = - estimator.distance(vector, - selectedNeighbor.getNodeReference().getVector()); + for (final NodeReferenceWithVector neighborReferenceWithVector : neighborReferenceWithVectors) { + final var neighborVector = neighborReferenceWithVector.getVector(); + final double distance = estimator.distance(neighborVector, nodeReferenceWithVector.getVector()); nodeReferencesWithDistancesBuilder.add( - new NodeReferenceWithDistance(nodeReferenceWithVector.getPrimaryKey(), - vector, distance)); + new NodeReferenceWithDistance(neighborReferenceWithVector.getPrimaryKey(), + neighborVector, distance)); } return nodeReferencesWithDistancesBuilder.build(); }) .thenCompose(nodeReferencesAndNodes -> - selectNeighbors(storageAdapter, transaction, storageTransform, estimator, + selectCandidates(storageAdapter, transaction, storageTransform, estimator, nodeReferencesAndNodes, layer, mMax, nodeCache)); } @@ -1289,14 +1308,14 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * each represented as a {@link NodeReferenceAndNode} */ private CompletableFuture>> - selectNeighbors(@Nonnull final StorageAdapter storageAdapter, - @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, - @Nonnull final Estimator estimator, - @Nonnull final Iterable initialCandidates, - final int layer, - final int m, - @Nonnull final Map> nodeCache) { + selectCandidates(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Estimator estimator, + @Nonnull final Iterable initialCandidates, + final int layer, + final int m, + @Nonnull final Map> nodeCache) { final Metric metric = getConfig().getMetric(); final List selected = Lists.newArrayListWithExpectedSize(m); @@ -1368,7 +1387,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * @param estimator the estimator * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the * storage space that is currently 
being used - * @param candidates an {@link Iterable} of initial candidate nodes, which have already been evaluated + * @param candidates an {@link Collection} of initial candidate nodes, which have already been evaluated * @param layer the graph layer from which to fetch nodes * @param isExtendCandidates a boolean flag; if {@code true}, the candidate set is extended with neighbors * @param nodeCache a cache mapping primary keys to {@link AbstractNode} objects to avoid redundant fetches @@ -1382,7 +1401,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Estimator estimator, - @Nonnull final Iterable> candidates, + @Nonnull final Collection> candidates, final int layer, final boolean isExtendCandidates, @Nonnull final Map> nodeCache, @@ -1397,8 +1416,8 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) } if (isExtendCandidates) { - return neighbors(storageAdapter, readTransaction, storageTransform, candidates, - HopMode.INCLUSIVE, layer, nodeCache) + return neighborReferences(storageAdapter, readTransaction, storageTransform, null, candidates, + HopMode.INCLUSIVE, CandidateSamplingPredicate.tautology(), layer, nodeCache) .thenApply(hop2 -> { for (final NodeReferenceWithVector nodeReferenceWithVector : hop2) { final double distance = estimator.distance(nodeReferenceWithVector.getVector(), vector); @@ -1413,37 +1432,96 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) } /** - * Compute and if necessary fetch the neighbors of an iterable of initial nodes that is passed in. Hop is defined as the - * set of all nodes that are neighbors of the initial nodes. Note that the neighbor of an initial node might - * be another initial node. If that is the case the node is returned. If that is not desired by the caller, the - * caller needs to remove those nodes via a subtraction of the initial set. + * Compute and if necessary fetch the neighbor references (with vectors) and the neighboring nodes of an iterable + * of initial nodes that is passed in. Note that the neighbor of an initial node might be another initial node. + * If that is the case the node is returned. 
* * @param the type of the {@link NodeReference} * @param storageAdapter the {@link StorageAdapter} used to access node data from storage * @param readTransaction the active {@link ReadTransaction} for database access * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the * storage space that is currently being used + * @param random the random to be used for sampling * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes, which have already been evaluated * @param layer the graph layer from which to fetch nodes * @param nodeCache a cache mapping primary keys to {@link AbstractNode} objects to avoid redundant fetches * - * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReferenceWithDistance} + * @return a {@link CompletableFuture} which will complete with a list of fetched nodes */ - private CompletableFuture> + private CompletableFuture>> neighbors(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, - @Nonnull final Iterable> initialNodeReferenceAndNodes, + @Nonnull final SplittableRandom random, + @Nonnull final Collection> initialNodeReferenceAndNodes, @Nonnull final HopMode hopMode, + @Nonnull final CandidateSamplingPredicate samplingPredicate, final int layer, - @Nonnull - final Predicate samplingPredicate, @Nonnull final Map> nodeCache) { - final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); + return neighborReferences(storageAdapter, readTransaction, storageTransform, random, + initialNodeReferenceAndNodes, hopMode, samplingPredicate, layer, nodeCache) + .thenCompose(neighborsFirstDegree -> + fetchSomeNodesIfNotCached(storageAdapter, readTransaction, storageTransform, layer, + neighborsFirstDegree, nodeCache)); + } + + /** + * Compute and if necessary fetch the neighbor references (with vectors) of an iterable of initial nodes that is + * passed in. Note that the neighbor of an initial node might be another initial node. If that is the case the node + * is returned. 
+ * + * @param the type of the {@link NodeReference} + * @param storageAdapter the {@link StorageAdapter} used to access node data from storage + * @param readTransaction the active {@link ReadTransaction} for database access + * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the + * storage space that is currently being used + * @param random a {@link SplittableRandom} to be used for sampling + * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes, which have already been + * evaluated + * @param hopMode the {@link HopMode} we should use + * @param samplingPredicate a predicate that restricts the number of neighbors to be fetched + * @param layer the graph layer from which to fetch nodes + * @param nodeCache a cache mapping primary keys to {@link AbstractNode} objects to avoid redundant fetches + * + * @return a {@link CompletableFuture} which will complete with a list of {@link NodeReferenceWithVector} + */ + private CompletableFuture> + neighborReferences(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final AffineOperator storageTransform, + @Nullable final SplittableRandom random, + @Nonnull final Collection> initialNodeReferenceAndNodes, + @Nonnull final HopMode hopMode, + @Nonnull final CandidateSamplingPredicate samplingPredicate, + final int layer, + @Nonnull final Map> nodeCache) { + final Iterable toBeFetched = + Iterables.filter(resolveNeighborReferences(initialNodeReferenceAndNodes, hopMode), + nodeReference -> samplingPredicate.test(random, + initialNodeReferenceAndNodes.size(), nodeReference)); + return fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, toBeFetched, + nodeCache); + } + + /** + * Compute the neighbors of an iterable of initial nodes that is passed in. Hop is defined as the + * set of all nodes that are neighbors of the initial nodes. Note that the neighbor of an initial node might + * be another initial node. If that is the case the node is returned. If that is not desired by the caller, the + * caller needs to remove those nodes via a subtraction of the initial set. 
+ * + * @param the type of the {@link NodeReference} + * storage space that is currently being used + * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes, which have already been evaluated + * + * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReferenceWithDistance} + */ + private Set + resolveNeighborReferences(@Nonnull final Iterable> initialNodeReferenceAndNodes, + @Nonnull final HopMode hopMode) { + final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { - initialNodesMapBuilder.put(nodeReferenceAndNode.getNodeReference().getPrimaryKey(), - nodeReferenceAndNode); + initialNodesMapBuilder.put(nodeReferenceAndNode.getNode().getPrimaryKey(), nodeReferenceAndNode); if (hopMode == HopMode.INCLUSIVE) { resultBuilder.add(nodeReferenceAndNode.getNodeReference()); } @@ -1452,7 +1530,6 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) final Set nodeReferencesSeen = Sets.newHashSet(); - final ImmutableList.Builder toBeFetchedBuilder = ImmutableList.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { for (final N neighbor : nodeReferenceAndNode.getNode().getNeighbors()) { final Tuple neighborPrimaryKey = neighbor.getPrimaryKey(); @@ -1470,20 +1547,12 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) // This is a node that is currently not known to us. It is not an initial node. We need to fetch it, // and we need to mark it as seen so we won't consider it more than once. // - - // TODO use the sampling predicate to determine if we want to fetch this neighbor or not - toBeFetchedBuilder.add(neighbor); + resultBuilder.add(neighbor); nodeReferencesSeen.add(neighborPrimaryKey); } } } - final Iterable toBeFetched = toBeFetchedBuilder.build(); - - return fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, toBeFetched, nodeCache) - .thenApply(withVectors -> { - resultBuilder.addAll(withVectors); - return resultBuilder.build(); - }); + return resultBuilder.build(); } /** @@ -1537,8 +1606,8 @@ private void writeLonelyNodeOnLayer(@Nonnull final Qua @Nonnull final Tuple primaryKey, @Nonnull final Transformed vector) { storageAdapter.writeNode(transaction, quantizer, - storageAdapter.getNodeFactory() - .create(primaryKey, vector, ImmutableList.of()), layer, + layer, storageAdapter.getNodeFactory() + .create(primaryKey, vector, ImmutableList.of()), new BaseNeighborsChangeSet<>(ImmutableList.of())); if (logger.isTraceEnabled()) { logger.trace("written lonely node at key={} on layer={}", primaryKey, layer); @@ -1647,7 +1716,8 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N } /** - * Deletes a node from the HNSW graph across multiple layers, starting from a given top layer and entry point. + * Deletes a node from the HNSW graph across multiple layers, using a primary key and starting from a given top + * layer. 
* * @param transaction the transaction to use for database operations * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the @@ -1665,6 +1735,7 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N private CompletableFuture deleteFromLayers(@Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Quantizer quantizer, + @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, @Nonnull final NodeReferenceWithDistance nodeReference, final int topLayer) { @@ -1675,7 +1746,8 @@ private CompletableFuture deleteFromLayers(@Nonnull final Transaction tran return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), layer -> { final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); - return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, layer, primaryKey); + return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, random, layer, + primaryKey); }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor).thenCompose(ignored -> AsyncUtil.DONE); @@ -1694,76 +1766,202 @@ private CompletableFuture deleteFromLayers(@Nonnull final Transaction tran * storage space that is currently being used * @param quantizer the quantizer for this insert * @param layer the layer number to insert the new node into - * @param primaryKey the primary key of the new node to be inserted + * @param toBeDeletedPrimaryKey the primary key of the new node to be inserted * * @return a {@code CompletableFuture} that completes with a {@code null} */ @Nonnull - private CompletableFuture + private CompletableFuture deleteFromLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, @Nonnull final Quantizer quantizer, + @Nonnull final SplittableRandom random, final int layer, - @Nonnull final Tuple primaryKey) { + @Nonnull final Tuple toBeDeletedPrimaryKey) { if (logger.isTraceEnabled()) { - logger.trace("begin delete key={} at layer={}", primaryKey, layer); + logger.trace("begin delete key={} at layer={}", toBeDeletedPrimaryKey, layer); } - final Map> nodeCache = Maps.newConcurrentMap(); final Estimator estimator = quantizer.estimator(); + final Map> nodeCache = Maps.newConcurrentMap(); + final Map> candidateChangeSetMap = + Maps.newConcurrentMap(); - return storageAdapter.fetchNode(transaction, storageTransform, layer, primaryKey) + return storageAdapter.fetchNode(transaction, storageTransform, layer, toBeDeletedPrimaryKey) .thenCompose(toBeDeletedNode -> { final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = - new NodeReferenceAndNode<>(new NodeReferenceWithVector(primaryKey, + new NodeReferenceAndNode<>(new NodeReferenceWithVector(toBeDeletedPrimaryKey, toBeDeletedNode.asCompactNode().getVector()), toBeDeletedNode); - return neighbors(storageAdapter, transaction, storageTransform, - ImmutableList.of(toBeDeletedNodeReferenceAndNode), HopMode.INCLUSIVE, layer, nodeCache) - .thenCompose(neighborsFirstDegree -> - fetchSomeNodesIfNotCached(storageAdapter, transaction, storageTransform, layer, - neighborsFirstDegree, nodeCache)) - .thenCompose(neighborsFirstDegree -> - neighbors(storageAdapter, transaction, storageTransform, neighborsFirstDegree, - HopMode.INCLUSIVE, layer, nodeCache)) - .thenCompose(neighborsSecondDegree -> - forEach(toBeDeletedNode.getNeighbors(), - neighborReference -> 
fetchNodeIfNotCached(storageAdapter, transaction, - storageTransform, layer, neighborReference, nodeCache) - .thenCompose(neighborNode -> { - final ImmutableSet.Builder candidatesBuilder = ImmutableSet.builder(); - for (final NodeReferenceWithVector nodeReferenceWithVector : neighborsSecondDegree) { - final double distance = - estimator.distance(nodeReferenceWithVector.getVector(), - storageAdapter.getVector(neighborReference, neighborNode)); - candidatesBuilder.add(new NodeReferenceWithDistance(nodeReferenceWithVector.getPrimaryKey(), - nodeReferenceWithVector.getVector(), distance)); + return neighbors(storageAdapter, transaction, storageTransform, random, + ImmutableList.of(toBeDeletedNodeReferenceAndNode), HopMode.INCLUSIVE, + CandidateSamplingPredicate.tautology(), layer, nodeCache) + .thenCompose(candidates -> + neighbors(storageAdapter, transaction, storageTransform, random, + candidates, HopMode.INCLUSIVE, + this::shouldSampleCandidate, layer, nodeCache)) + .thenApply(candidates -> { + final ImmutableList.Builder> filteredCandidatesBuilder = + ImmutableList.builder(); + for (final NodeReferenceAndNode neighbor : candidates) { + // filter out neighbors that happen to be the node we are trying to delete + if (!neighbor.getNodeReference().getPrimaryKey().equals(toBeDeletedPrimaryKey)) { + filteredCandidatesBuilder.add(neighbor); + } + } + return filteredCandidatesBuilder.build(); + }) + .thenCompose(candidates -> + forEach(toBeDeletedNode.getNeighbors(), // for each direct neighbor + neighborReference -> + prepareCandidatesAndRepairNeighbor(storageAdapter, transaction, + storageTransform, quantizer, layer, toBeDeletedPrimaryKey, + neighborReference, candidates, candidateChangeSetMap, + nodeCache), + getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor) + .thenApply(ignored -> { + final ImmutableMap.Builder candidateReferencesMapBuilder = + ImmutableMap.builder(); + for (final NodeReferenceAndNode candidate : candidates) { + final var candidatePrimaryKey = candidate.getNodeReference().getPrimaryKey(); + if (candidateChangeSetMap.containsKey(candidatePrimaryKey)) { + candidateReferencesMapBuilder.put(candidatePrimaryKey, candidate.getNodeReference()); + } + } + return candidateReferencesMapBuilder.build(); + })) + .thenCompose(candidateReferencesMap -> { + final int currentMMax = + layer == 0 ? 
getConfig().getMMax0() : getConfig().getMMax(); + + return forEach(candidateChangeSetMap.entrySet(), // for each modified neighbor set + changeSetEntry -> { + final NodeReferenceWithVector candidateReference = + Objects.requireNonNull(candidateReferencesMap.get(changeSetEntry.getKey())); + final NeighborsChangeSet candidateChangeSet = changeSetEntry.getValue(); + return pruneNeighborsIfNecessary(storageAdapter, transaction, + storageTransform, estimator, layer, candidateReference, + currentMMax, candidateChangeSet, nodeCache) + .thenApply(nodeReferencesAndNodes -> { + if (nodeReferencesAndNodes == null) { + return candidateChangeSet; } - return repairOutNeighborNode(storageAdapter, transaction, - storageTransform, quantizer, candidatesBuilder.build(), layer, nodeCache); - }), - getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor)) - .thenCompose(ignored -> AsyncUtil.DONE); + + final var prunedCandidateChangeSet = + resolveChangeSetFromNewNeighbors(candidateChangeSet, + nodeReferencesAndNodes); + candidateChangeSetMap.put(changeSetEntry.getKey(), prunedCandidateChangeSet); + return prunedCandidateChangeSet; + }); + }, + getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor) + .thenApply(ignored -> candidateReferencesMap); + }) + .thenApply(candidateReferencesMap -> { + storageAdapter.deleteNode(transaction, layer, toBeDeletedPrimaryKey); + + for (final Map.Entry> changeSetEntry : candidateChangeSetMap.entrySet()) { + final AbstractNode candidateNode = + nodeFromCache(changeSetEntry.getKey(), nodeCache); + storageAdapter.writeNode(transaction, quantizer, + layer, candidateNode, changeSetEntry.getValue()); + } + + // + // Return the first item in the candidates reference map as a potential new + // entry node reference in order to avoid a costly search for a new global entry point. + // This reference may not exist in a sparse HNSW but that case should be exceedingly + // rare. + // + final Tuple firstPrimaryKey = + Iterables.getFirst(candidateReferencesMap.keySet(), null); + return firstPrimaryKey == null + ? 
null + : new EntryNodeReference(firstPrimaryKey, + Objects.requireNonNull(candidateReferencesMap.get(firstPrimaryKey)).getVector(), + layer); + }); }).thenApply(result -> { if (logger.isTraceEnabled()) { - logger.trace("end delete key={} at layer={}", primaryKey, layer); + logger.trace("end delete key={} at layer={}", toBeDeletedPrimaryKey, layer); } return result; }); } + private @Nonnull CompletableFuture + prepareCandidatesAndRepairNeighbor(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + final int layer, + @Nonnull final Tuple toBeDeletedPrimaryKey, + @Nonnull final N neighborReference, + @Nonnull final Collection> sampledCandidates, + @Nonnull final Map> neighborChangeSetMap, + @Nonnull final Map> nodeCache) { + final Estimator estimator = quantizer.estimator(); + + return fetchNodeIfNotCached(storageAdapter, transaction, + storageTransform, layer, neighborReference, nodeCache) + .thenCompose(neighborNode -> { + final ImmutableList.Builder candidatesReferencesBuilder = + ImmutableList.builder(); + final Transformed neighborVector = storageAdapter.getVector(neighborReference, neighborNode); + for (final NodeReferenceAndNode candidate : sampledCandidates) { + // do not add the candidate if that candidate is in fact the neighbor itself + if (!candidate.getNodeReference().getPrimaryKey().equals(neighborReference.getPrimaryKey())) { + final Transformed candidateVector = + candidate.getNodeReference().getVector(); + final double distance = + estimator.distance(candidateVector, + neighborVector); + candidatesReferencesBuilder.add(new NodeReferenceWithDistance(candidate.getNode().getPrimaryKey(), + candidateVector, distance)); + } + } + return repairInsForNeighborNode(storageAdapter, transaction, storageTransform, quantizer, + layer, toBeDeletedPrimaryKey, neighborReference, candidatesReferencesBuilder.build(), + neighborChangeSetMap, nodeCache); + }); + } + private CompletableFuture - repairOutNeighborNode(@Nonnull final StorageAdapter storageAdapter, - @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, - @Nonnull final Quantizer quantizer, - @Nonnull final Set candidates, - final int layer, - final Map> nodeCache) { + repairInsForNeighborNode(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + final int layer, + @Nonnull final Tuple toBeDeletedPrimaryKey, + @Nonnull final N neighborReference, + @Nonnull final Iterable candidates, + @Nonnull final Map> neighborChangeSetMap, + final Map> nodeCache) { final Estimator estimator = quantizer.estimator(); - return selectNeighbors(storageAdapter, transaction, storageTransform, estimator, candidates, - layer, getConfig().getM(), nodeCache); + return selectCandidates(storageAdapter, transaction, storageTransform, estimator, candidates, + layer, getConfig().getM(), nodeCache) + .thenCompose(selectedCandidates -> { + // create change sets for each selected neighbor and insert new node into them + for (final NodeReferenceAndNode selectedCandidate : selectedCandidates) { + neighborChangeSetMap.compute(selectedCandidate.getNode().getPrimaryKey(), + (ignored, oldChangeSet) -> { + final NeighborsChangeSet baseSet; + if (oldChangeSet == null) { + baseSet = + // delete the primary key of the record we are trying to delete + new DeleteNeighborsChangeSet<>( + new 
BaseNeighborsChangeSet<>(selectedCandidate.getNode().getNeighbors()), + ImmutableList.of(toBeDeletedPrimaryKey)); + } else { + baseSet = oldChangeSet; + } + // insert a reference to the neighbor + return new InsertNeighborsChangeSet<>(baseSet, ImmutableList.of(neighborReference)); + }); + } + return AsyncUtil.DONE; + }); } /** @@ -1844,6 +2042,15 @@ private int topLayer(@Nonnull final Tuple primaryKey) { return (int) Math.floor(-Math.log(u) * lambda); } + @SuppressWarnings("unused") + private boolean shouldSampleCandidate(@Nullable final SplittableRandom random, final int numberOfCandidates, NodeReference nodeReference) { + final double sampleRate = (double)getConfig().getM() / numberOfCandidates; + if (sampleRate >= 1) { + return true; + } + return Objects.requireNonNull(random).nextDouble() < sampleRate; + } + private boolean shouldSampleVector(@Nonnull final SplittableRandom random) { return random.nextDouble() < getConfig().getSampleVectorStatsProbability(); } @@ -1852,6 +2059,14 @@ private boolean shouldMaintainStats(@Nonnull final SplittableRandom random) { return random.nextDouble() < getConfig().getMaintainStatsProbability(); } + private static T randomElement(@Nonnull final SplittableRandom random, + @Nonnull final Collection collection) { + if (collection.isEmpty()) { + throw new IllegalArgumentException("collection is empty"); + } + return Iterables.get(collection, random.nextInt(collection.size())); + } + private static double splitMixDouble(final long x) { return (splitMixLong(x) >>> 11) * 0x1.0p-53; } @@ -1874,7 +2089,7 @@ private static List drain(@Nonnull Queue queue) { } /** - * Let {@code I} be the set of initial nodes for {@link #neighbors}. Let {@code H(I)} be the set of nodes that can be + * Let {@code I} be the set of initial nodes for {@link #neighborReferences}. Let {@code H(I)} be the set of nodes that can be * reached by traversing the neighbors of nodes in {@code I}. 
*/ private enum HopMode { @@ -1892,6 +2107,16 @@ private enum HopMode { EXCLUSIVE_ALL; } + @FunctionalInterface + private interface CandidateSamplingPredicate { + @Nonnull + static CandidateSamplingPredicate tautology() { + return (random, size, nodeReference) -> true; + } + + boolean test(@Nullable SplittableRandom random, int size, NodeReference nodeReference); + } + private static class AccessInfoAndNodeExistence { @Nullable private final AccessInfo accessInfo; diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java index 0f1516787d..157f391b86 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java @@ -217,15 +217,15 @@ private NodeReferenceWithVector neighborFromTuples(@Nonnull final AffineOperator * * @param transaction the transaction context for the write operation; must not be null * @param quantizer the quantizer to use + * @param layer the layer index where the node and its neighbor changes should be written * @param node the node to be written, which is expected to be an * {@code InliningNode}; must not be null - * @param layer the layer index where the node and its neighbor changes should be written * @param neighborsChangeSet the set of changes to the node's neighbors to be * persisted; must not be null */ @Override public void writeNodeInternal(@Nonnull final Transaction transaction, @Nonnull final Quantizer quantizer, - @Nonnull final AbstractNode node, final int layer, + final int layer, @Nonnull final AbstractNode node, @Nonnull final NeighborsChangeSet neighborsChangeSet) { final InliningNode inliningNode = node.asInliningNode(); @@ -233,6 +233,14 @@ public void writeNodeInternal(@Nonnull final Transaction transaction, @Nonnull f getOnWriteListener().onNodeWritten(layer, node); } + @Override + protected void deleteNodeInternal(@Nonnull final Transaction transaction, final int layer, + @Nonnull final Tuple primaryKey) { + final byte[] key = getNodeKey(layer, primaryKey); + transaction.clear(Range.startsWith(key)); + getOnWriteListener().onNodeDeleted(layer, primaryKey); + } + /** * Constructs the raw database key for a node based on its layer and primary key. *

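The two deleteNodeInternal implementations differ because the layouts differ: a compact node is a single key-value pair, while an inlining node stores each neighbor as its own key under the node's key prefix. Roughly, with nodeKey produced by the respective getNodeKey:

    // Compact layout: the node occupies exactly one key.
    transaction.clear(nodeKey);

    // Inlining layout: the node's neighbors live under its key prefix,
    // so a single range clear removes the node and all of its neighbor entries.
    transaction.clear(Range.startsWith(nodeKey));

Both variants then notify the write listener through onNodeDeleted.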
diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java index aacc1ca8f2..398e7770c8 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java @@ -44,6 +44,19 @@ default void onNodeWritten(final int layer, @Nonnull final Node + * This is a default method with an empty implementation, allowing implementing classes to override it only if they + * need to react to this event. + * @param layer the index of the layer where the node was deleted. + * @param primaryKey the {@link Tuple} used as key to identify the node that was deleted; guaranteed to be non-null. + */ + @SuppressWarnings("unused") + default void onNodeDeleted(final int layer, @Nonnull final Tuple primaryKey) { + // nothing + } + /** * Callback method invoked when a neighbor is written for a specific node. *

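Since onNodeDeleted ships with an empty default body, existing listeners keep compiling and only interested ones override it. A minimal sketch of such a listener, assuming the remaining OnWriteListener callbacks are default methods like the ones shown here; the class and field names are made up:

    import com.apple.foundationdb.tuple.Tuple;

    import javax.annotation.Nonnull;

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Counts node deletions per layer, e.g. for tests or metrics; all other callbacks keep their defaults.
    class DeletionCountingListener implements OnWriteListener {
        private final Map<Integer, Long> deletesByLayer = new ConcurrentHashMap<>();

        @Override
        public void onNodeDeleted(final int layer, @Nonnull final Tuple primaryKey) {
            deletesByLayer.merge(layer, 1L, Long::sum);
        }

        public long deletesOnLayer(final int layer) {
            return deletesByLayer.getOrDefault(layer, 0L);
        }
    }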
diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java index de33e1c1be..4963b5382f 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java @@ -149,18 +149,28 @@ CompletableFuture> fetchNode(@Nonnull ReadTransaction readTransa /** * Writes a node and its neighbor changes to the data store within a given transaction. *

- * This method is responsible for persisting the state of a {@link AbstractNode} and applying any modifications to its + * This method is responsible for persisting the state of a {@link AbstractNode} and applying any modifications to + * its * neighboring nodes as defined in the {@code NeighborsChangeSet}. The entire operation is performed atomically as * part of the provided {@link Transaction}. + * * @param transaction the non-null transaction context for this write operation. * @param quantizer the quantizer to use - * @param node the non-null node to be written to the data store. * @param layer the layer index where the node resides. + * @param node the non-null node to be written to the data store. * @param changeSet the non-null set of changes describing additions or removals of * neighbors for the given {@link AbstractNode}. */ - void writeNode(@Nonnull Transaction transaction, @Nonnull Quantizer quantizer, @Nonnull AbstractNode node, - int layer, @Nonnull NeighborsChangeSet changeSet); + void writeNode(@Nonnull Transaction transaction, @Nonnull Quantizer quantizer, int layer, + @Nonnull AbstractNode node, @Nonnull NeighborsChangeSet changeSet); + + /** + * Deletes a node from the database. + * @param transaction the transaction to use + * @param layer the layer the node should be removed from + * @param primaryKey the primary key of the node + */ + void deleteNode(@Nonnull Transaction transaction, int layer, @Nonnull Tuple primaryKey); /** * Scans a specified layer of the structure, returning an iterable sequence of nodes. @@ -327,6 +337,7 @@ static CompletableFuture> consumeSampledVectors(@Nonnull final byte[] key = keyValue.getKey(); final byte[] value = keyValue.getValue(); resultBuilder.add(aggregatedVectorFromRaw(prefixSubspace, key, value)); + // this is done to not lock the entire range we just read but jst the keys we did read transaction.addReadConflictKey(key); transaction.clear(key); onReadListener.onKeyValueRead(-1, key, value); diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 07538f8e20..5be34d608a 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -544,7 +544,7 @@ private void writeNode(@Nonnull final Transaction tran final NeighborsChangeSet insertChangeSet = new InsertNeighborsChangeSet<>(new BaseNeighborsChangeSet<>(ImmutableList.of()), node.getNeighbors()); - storageAdapter.writeNode(transaction, Quantizer.noOpQuantizer(Metric.EUCLIDEAN_METRIC), node, layer, + storageAdapter.writeNode(transaction, Quantizer.noOpQuantizer(Metric.EUCLIDEAN_METRIC), layer, node, insertChangeSet); } From 1a2ee97c302b433045b2fe7f6b5e6710d768d556 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Fri, 5 Dec 2025 14:24:20 +0100 Subject: [PATCH 03/17] inserts work again after refactorings for updates-deletes codepath are completed --- .../async/hnsw/AbstractStorageAdapter.java | 15 ++ .../async/hnsw/CompactStorageAdapter.java | 1 + .../apple/foundationdb/async/hnsw/HNSW.java | 209 +++++++++--------- .../async/hnsw/InliningStorageAdapter.java | 4 +- .../async/hnsw/OnWriteListener.java | 11 + .../async/hnsw/StorageAdapter.java | 24 +- .../async/hnsw/StorageTransform.java | 7 + 7 files changed, 165 insertions(+), 106 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java 
b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java index 3b1952373d..9f8296fa2f 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java @@ -26,6 +26,7 @@ import com.apple.foundationdb.linear.Quantizer; import com.apple.foundationdb.subspace.Subspace; import com.apple.foundationdb.tuple.Tuple; +import com.google.common.base.Verify; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -98,6 +99,20 @@ public NodeFactory getNodeFactory() { return nodeFactory; } + @Override + public boolean isInliningStorageAdapter() { + final boolean isInliningStorageAdapter = getNodeFactory().getNodeKind() == NodeKind.INLINING; + Verify.verify(!isInliningStorageAdapter || this instanceof InliningStorageAdapter); + return isInliningStorageAdapter; + } + + @Nonnull + @Override + public InliningStorageAdapter asInliningStorageAdapter() { + Verify.verify(isInliningStorageAdapter()); + return (InliningStorageAdapter)this; + } + @Override @Nonnull public Subspace getSubspace() { diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java index aa4012158d..debba0d16c 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java @@ -265,6 +265,7 @@ protected void deleteNodeInternal(@Nonnull final Transaction transaction, final final byte[] key = getNodeKey(layer, primaryKey); transaction.clear(key); getOnWriteListener().onNodeDeleted(layer, primaryKey); + getOnWriteListener().onKeyDeleted(layer, key); } /** diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index bb7a7a48a5..9b5e71aece 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -197,9 +197,9 @@ public OnReadListener getOnReadListener() { } @Nonnull - private AffineOperator storageTransform(@Nullable final AccessInfo accessInfo) { + private StorageTransform storageTransform(@Nullable final AccessInfo accessInfo) { if (accessInfo == null || !accessInfo.canUseRaBitQ()) { - return AffineOperator.identity(); + return StorageTransform.identity(); } return new StorageTransform(accessInfo.getRotatorSeed(), @@ -259,7 +259,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { } final EntryNodeReference entryNodeReference = accessInfo.getEntryNodeReference(); - final AffineOperator storageTransform = storageTransform(accessInfo); + final StorageTransform storageTransform = storageTransform(accessInfo); final Transformed transformedQueryVector = storageTransform.transform(queryVector); final Quantizer quantizer = quantizer(accessInfo); final Estimator estimator = quantizer.estimator(); @@ -362,7 +362,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { * @param storageAdapter the {@link StorageAdapter} for accessing the graph data * @param readTransaction the {@link ReadTransaction} to use for the search * @param estimator a distance estimator - * @param nodeReference the starting point for the search on this layer, which includes the node and its 
distance to + * @param nodeReferenceWithDistance the starting point for the search on this layer, which includes the node and its distance to * the query vector * @param layer the zero-based index of the layer to search within * @param queryVector the query vector for which to find the nearest neighbor @@ -374,15 +374,64 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { private CompletableFuture greedySearchLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, - @Nonnull final NodeReferenceWithDistance nodeReference, + @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, final int layer, @Nonnull final Transformed queryVector) { - return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, - ImmutableList.of(nodeReference), layer, 1, Maps.newConcurrentMap(), queryVector) - .thenApply(searchResult -> - Iterables.getOnlyElement(searchResult).getNodeReference()); + if (storageAdapter.isInliningStorageAdapter()) { + return greedySearchInliningLayer(storageAdapter.asInliningStorageAdapter(), readTransaction, + storageTransform, estimator, nodeReferenceWithDistance, layer, queryVector); + } else { + return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, + ImmutableList.of(nodeReferenceWithDistance), layer, 1, Maps.newConcurrentMap(), queryVector) + .thenApply(searchResult -> + Iterables.getOnlyElement(searchResult).getNodeReference()); + } + } + + @Nonnull + private CompletableFuture greedySearchInliningLayer(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final StorageTransform storageTransform, + @Nonnull final Estimator estimator, + @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, + final int layer, + @Nonnull final Transformed queryVector) { + final AtomicReference currentNodeReferenceAtomic = + new AtomicReference<>(nodeReferenceWithDistance); + + return AsyncUtil.whileTrue(() -> onReadListener.onAsyncRead( + storageAdapter.fetchNode(readTransaction, storageTransform, layer, currentNodeReferenceAtomic.get().getPrimaryKey())) + .thenApply(node -> { + if (node == null) { + throw new IllegalStateException("unable to fetch node"); + } + final InliningNode inliningNode = node.asInliningNode(); + final List neighbors = inliningNode.getNeighbors(); + + final NodeReferenceWithDistance currentNodeReference = currentNodeReferenceAtomic.get(); + double minDistance = currentNodeReference.getDistance(); + + NodeReferenceWithVector nearestNeighbor = null; + for (final NodeReferenceWithVector neighbor : neighbors) { + final double distance = + estimator.distance(neighbor.getVector(), queryVector); + if (distance < minDistance) { + minDistance = distance; + nearestNeighbor = neighbor; + } + } + + if (nearestNeighbor == null) { + return false; + } + + currentNodeReferenceAtomic.set( + new NodeReferenceWithDistance(nearestNeighbor.getPrimaryKey(), nearestNeighbor.getVector(), + minDistance)); + return true; + }), executor).thenApply(ignored -> currentNodeReferenceAtomic.get()); } /** @@ -793,7 +842,7 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N } final AccessInfo accessInfo = accessInfoAndNodeExistence.getAccessInfo(); - final AffineOperator storageTransform = storageTransform(accessInfo); + final StorageTransform storageTransform = 
storageTransform(accessInfo); final Transformed transformedNewVector = storageTransform.transform(newVector); final Quantizer quantizer = quantizer(accessInfo); final Estimator estimator = quantizer.estimator(); @@ -944,8 +993,8 @@ private CompletableFuture addToStatsIfNecessary(@Nonnull final SplittableR final AccessInfo newAccessInfo = new AccessInfo(currentAccessInfo.getEntryNodeReference().withVector(transformedEntryNodeVector), rotatorSeed, rotatedCentroid); - StorageAdapter.writeAccessInfo(transaction, getSubspace(), newAccessInfo, onWriteListener); - StorageAdapter.removeAllSampledVectors(transaction, getSubspace()); + StorageAdapter.writeAccessInfo(transaction, getSubspace(), newAccessInfo, getOnWriteListener()); + StorageAdapter.deleteAllSampledVectors(transaction, getSubspace(), getOnWriteListener()); if (logger.isTraceEnabled()) { logger.trace("established rotatorSeed={}, centroid with count={}, centroid={}", rotatorSeed, partialCount, rotatedCentroid); @@ -1406,14 +1455,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) final boolean isExtendCandidates, @Nonnull final Map> nodeCache, @Nonnull final Transformed vector) { - - // - // Add all given candidates to the result. - // final ImmutableList.Builder resultBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - resultBuilder.add(candidate.getNodeReference()); - } if (isExtendCandidates) { return neighborReferences(storageAdapter, readTransaction, storageTransform, null, candidates, @@ -1427,6 +1469,13 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) return resultBuilder.build(); }); } else { + // + // Add all given candidates to the result. + // + for (final NodeReferenceAndNode candidate : candidates) { + resultBuilder.add(candidate.getNodeReference()); + } + return CompletableFuture.completedFuture(resultBuilder.build()); } } @@ -1629,6 +1678,7 @@ private void writeLonelyNodeOnLayer(@Nonnull final Qua */ @Nonnull public CompletableFuture delete(@Nonnull final Transaction transaction, @Nonnull final Tuple primaryKey) { + final SplittableRandom random = random(primaryKey); final int topLayer = topLayer(primaryKey); if (logger.isTraceEnabled()) { logger.trace("new node with key={} selected to be deleted form layer={}", primaryKey, topLayer); @@ -1651,68 +1701,32 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N } final AccessInfo accessInfo = accessInfoAndNodeExistence.getAccessInfo(); - final AffineOperator storageTransform = storageTransform(accessInfo); - final Transformed transformedNewVector = storageTransform.transform(newVector); + final EntryNodeReference entryNodeReference = + accessInfo == null ? 
null : accessInfo.getEntryNodeReference(); + final StorageTransform storageTransform = storageTransform(accessInfo); final Quantizer quantizer = quantizer(accessInfo); - final Estimator estimator = quantizer.estimator(); - final AccessInfo currentAccessInfo; - if (accessInfo == null) { - // this is the first node - writeLonelyNodes(quantizer, transaction, primaryKey, transformedNewVector, - topLayer, -1); - currentAccessInfo = new AccessInfo( - new EntryNodeReference(primaryKey, transformedNewVector, topLayer), - -1L, null); - StorageAdapter.writeAccessInfo(transaction, getSubspace(), currentAccessInfo, - getOnWriteListener()); - if (logger.isTraceEnabled()) { - logger.trace("written initial entry node reference with key={} on layer={}", - primaryKey, topLayer); - } - return AsyncUtil.DONE; - } else { - final EntryNodeReference entryNodeReference = accessInfo.getEntryNodeReference(); - final int lMax = entryNodeReference.getLayer(); - if (topLayer > lMax) { - writeLonelyNodes(quantizer, transaction, primaryKey, transformedNewVector, - topLayer, lMax); - currentAccessInfo = accessInfo.withNewEntryNodeReference( - new EntryNodeReference(primaryKey, transformedNewVector, - topLayer)); - StorageAdapter.writeAccessInfo(transaction, getSubspace(), currentAccessInfo, - getOnWriteListener()); - if (logger.isTraceEnabled()) { - logger.trace("written higher entry node reference with key={} on layer={}", - primaryKey, topLayer); - } - } else { - currentAccessInfo = accessInfo; - } - } - - final EntryNodeReference entryNodeReference = accessInfo.getEntryNodeReference(); - final int lMax = entryNodeReference.getLayer(); - if (logger.isTraceEnabled()) { - logger.trace("entry node read with key {} at layer {}", entryNodeReference.getPrimaryKey(), lMax); - } + return deleteFromLayers(transaction, storageTransform, quantizer, random, primaryKey, topLayer) + .thenCompose(potentialEntryNodeReferences -> { + if (entryNodeReference != null && primaryKey.equals(entryNodeReference.getPrimaryKey())) { + // find (and store) a new entry reference + for (int i = potentialEntryNodeReferences.size() - 1; i >= 0; i --) { + final EntryNodeReference potentialEntyNodeReference = + potentialEntryNodeReferences.get(i); + if (potentialEntyNodeReference != null) { + StorageAdapter.writeAccessInfo(transaction, getSubspace(), + accessInfo.withNewEntryNodeReference(potentialEntyNodeReference), getOnWriteListener()); + // early out + return AsyncUtil.DONE; + } + } - final NodeReferenceWithDistance initialNodeReference = - new NodeReferenceWithDistance(entryNodeReference.getPrimaryKey(), - entryNodeReference.getVector(), - estimator.distance(transformedNewVector, entryNodeReference.getVector())); - return forLoop(lMax, initialNodeReference, - layer -> layer > topLayer, - layer -> layer - 1, - (layer, previousNodeReference) -> { - final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); - return greedySearchLayer(storageAdapter, transaction, storageTransform, - estimator, previousNodeReference, layer, transformedNewVector); - }, executor) - .thenCompose(nodeReference -> - insertIntoLayers(transaction, storageTransform, quantizer, primaryKey, - transformedNewVector, nodeReference, lMax, topLayer)); - }).thenCompose(ignored -> AsyncUtil.DONE); + // officially there is no data in the structure, delete access info to start new + StorageAdapter.deleteAccessInfo(transaction, getSubspace(), getOnWriteListener()); + } + return AsyncUtil.DONE; + }); + }); } /** @@ -1724,23 +1738,20 @@ public CompletableFuture delete(@Nonnull 
final Transaction transaction, @N * storage space that is currently being used * @param quantizer the quantizer to be used for this insert * @param primaryKey the primary key of the new node being inserted - * @param nodeReference the initial entry point for the search, typically the nearest neighbor found in the highest - * layer * @param topLayer the top layer for the node. * * @return a {@link CompletableFuture} that completes when the new node has been successfully inserted into all - * its designated layers + * its designated layers and contains an existing neighboring entry node reference on that layer. */ @Nonnull - private CompletableFuture deleteFromLayers(@Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, - @Nonnull final Quantizer quantizer, - @Nonnull final SplittableRandom random, - @Nonnull final Tuple primaryKey, - @Nonnull final NodeReferenceWithDistance nodeReference, - final int topLayer) { + private CompletableFuture> deleteFromLayers(@Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + @Nonnull final SplittableRandom random, + @Nonnull final Tuple primaryKey, + final int topLayer) { if (logger.isTraceEnabled()) { - logger.trace("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, nodeReference.getPrimaryKey()); + logger.trace("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, primaryKey); } return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), @@ -1750,7 +1761,7 @@ private CompletableFuture deleteFromLayers(@Nonnull final Transaction tran primaryKey); }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), - executor).thenCompose(ignored -> AsyncUtil.DONE); + executor); } /** @@ -1920,7 +1931,7 @@ private CompletableFuture deleteFromLayers(@Nonnull final Transaction tran candidateVector, distance)); } } - return repairInsForNeighborNode(storageAdapter, transaction, storageTransform, quantizer, + return repairInsForNeighborNode(storageAdapter, transaction, storageTransform, estimator, layer, toBeDeletedPrimaryKey, neighborReference, candidatesReferencesBuilder.build(), neighborChangeSetMap, nodeCache); }); @@ -1930,15 +1941,13 @@ private CompletableFuture deleteFromLayers(@Nonnull final Transaction tran repairInsForNeighborNode(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, - @Nonnull final Quantizer quantizer, + @Nonnull final Estimator estimator, final int layer, @Nonnull final Tuple toBeDeletedPrimaryKey, @Nonnull final N neighborReference, @Nonnull final Iterable candidates, @Nonnull final Map> neighborChangeSetMap, final Map> nodeCache) { - final Estimator estimator = quantizer.estimator(); - return selectCandidates(storageAdapter, transaction, storageTransform, estimator, candidates, layer, getConfig().getM(), nodeCache) .thenCompose(selectedCandidates -> { @@ -2059,14 +2068,6 @@ private boolean shouldMaintainStats(@Nonnull final SplittableRandom random) { return random.nextDouble() < getConfig().getMaintainStatsProbability(); } - private static T randomElement(@Nonnull final SplittableRandom random, - @Nonnull final Collection collection) { - if (collection.isEmpty()) { - throw new IllegalArgumentException("collection is empty"); - } - return Iterables.get(collection, random.nextInt(collection.size())); - } - private static double splitMixDouble(final long x) { return 
(splitMixLong(x) >>> 11) * 0x1.0p-53; } @@ -2104,7 +2105,7 @@ private enum HopMode { /** * Return {@code H(I)}. */ - EXCLUSIVE_ALL; + EXCLUSIVE_ALL } @FunctionalInterface diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java index 157f391b86..d84c9968b1 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java @@ -237,8 +237,10 @@ public void writeNodeInternal(@Nonnull final Transaction transaction, @Nonnull f protected void deleteNodeInternal(@Nonnull final Transaction transaction, final int layer, @Nonnull final Tuple primaryKey) { final byte[] key = getNodeKey(layer, primaryKey); - transaction.clear(Range.startsWith(key)); + final Range range = Range.startsWith(key); + transaction.clear(range); getOnWriteListener().onNodeDeleted(layer, primaryKey); + getOnWriteListener().onRangeDeleted(layer, range); } /** diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java index 398e7770c8..3ec3b5a75e 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java @@ -20,6 +20,7 @@ package com.apple.foundationdb.async.hnsw; +import com.apple.foundationdb.Range; import com.apple.foundationdb.tuple.Tuple; import javax.annotation.Nonnull; @@ -96,4 +97,14 @@ default void onNeighborDeleted(final int layer, @Nonnull final Node { @Nonnull NodeFactory getNodeFactory(); + boolean isInliningStorageAdapter(); + + @Nonnull + InliningStorageAdapter asInliningStorageAdapter(); + /** * Get the subspace used to store this HNSW structure. * @return the subspace @@ -319,6 +324,21 @@ static void writeAccessInfo(@Nonnull final Transaction transaction, onWriteListener.onKeyValueWritten(entryNodeReference.getLayer(), key, value); } + /** + * Deletes the {@link AccessInfo} from the database within a given transaction and subspace. 
+ * @param transaction the database transaction to use for the write operation + * @param subspace the subspace where the entry node reference will be stored + * @param onWriteListener the listener to be notified after the key-value pair is written + */ + static void deleteAccessInfo(@Nonnull final Transaction transaction, + @Nonnull final Subspace subspace, + @Nonnull final OnWriteListener onWriteListener) { + final Subspace entryNodeSubspace = accessInfoSubspace(subspace); + final byte[] key = entryNodeSubspace.pack(); + transaction.clear(key); + onWriteListener.onKeyDeleted(-1, key); + } + @Nonnull static CompletableFuture> consumeSampledVectors(@Nonnull final Transaction transaction, @Nonnull final Subspace subspace, @@ -360,12 +380,14 @@ static void appendSampledVector(@Nonnull final Transaction transaction, onWriteListener.onKeyValueWritten(-1, prefixKey, value); } - static void removeAllSampledVectors(@Nonnull final Transaction transaction, @Nonnull final Subspace subspace) { + static void deleteAllSampledVectors(@Nonnull final Transaction transaction, @Nonnull final Subspace subspace, + @Nonnull final OnWriteListener onWriteListener) { final Subspace prefixSubspace = samplesSubspace(subspace); final byte[] prefixKey = prefixSubspace.pack(); final Range range = Range.startsWith(prefixKey); transaction.clear(range); + onWriteListener.onRangeDeleted(-1, range); } @Nonnull diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java index 3cac6f4826..e286fdc0d2 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java @@ -35,6 +35,8 @@ * system of the client and the coordinate system that is currently employed in the HNSW. 
*/ class StorageTransform extends AffineOperator { + private static final StorageTransform IDENTITY_STORAGE_TRANSFORM = new StorageTransform(null, null); + public StorageTransform(final long seed, final int numDimensions, @Nonnull final RealVector translationVector) { this(new FhtKacRotator(seed, numDimensions, 10), translationVector); @@ -67,4 +69,9 @@ public RealVector apply(@Nonnull final RealVector vector) { public RealVector invertedApply(@Nonnull final RealVector vector) { return super.invertedApply(vector); } + + @Nonnull + public static StorageTransform identity() { + return IDENTITY_STORAGE_TRANSFORM; + } } From 1cd881a61807e323834bbd7bca4130f45edb0c11 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Fri, 5 Dec 2025 18:40:29 +0100 Subject: [PATCH 04/17] inserts work again after refactorings for updates-deletes codepath are completed --- .../apple/foundationdb/async/hnsw/HNSW.java | 302 ++++++++++-------- .../foundationdb/async/hnsw/HNSWTest.java | 4 +- 2 files changed, 165 insertions(+), 141 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 9b5e71aece..21cf3f24a9 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -379,60 +379,64 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, final int layer, @Nonnull final Transformed queryVector) { - if (storageAdapter.isInliningStorageAdapter()) { - return greedySearchInliningLayer(storageAdapter.asInliningStorageAdapter(), readTransaction, - storageTransform, estimator, nodeReferenceWithDistance, layer, queryVector); - } else { - return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, - ImmutableList.of(nodeReferenceWithDistance), layer, 1, Maps.newConcurrentMap(), queryVector) - .thenApply(searchResult -> - Iterables.getOnlyElement(searchResult).getNodeReference()); - } +// if (storageAdapter.isInliningStorageAdapter()) { +// return greedySearchInliningLayer(storageAdapter.asInliningStorageAdapter(), readTransaction, +// storageTransform, estimator, nodeReferenceWithDistance, layer, queryVector); +// } else { + return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, + ImmutableList.of(nodeReferenceWithDistance), layer, 1, Maps.newConcurrentMap(), queryVector) + .thenApply(searchResult -> + Iterables.getOnlyElement(searchResult).getNodeReference()); +// } } - @Nonnull - private CompletableFuture greedySearchInliningLayer(@Nonnull final StorageAdapter storageAdapter, - @Nonnull final ReadTransaction readTransaction, - @Nonnull final StorageTransform storageTransform, - @Nonnull final Estimator estimator, - @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, - final int layer, - @Nonnull final Transformed queryVector) { - final AtomicReference currentNodeReferenceAtomic = - new AtomicReference<>(nodeReferenceWithDistance); - - return AsyncUtil.whileTrue(() -> onReadListener.onAsyncRead( - storageAdapter.fetchNode(readTransaction, storageTransform, layer, currentNodeReferenceAtomic.get().getPrimaryKey())) - .thenApply(node -> { - if (node == null) { - throw new IllegalStateException("unable to fetch node"); - } - final InliningNode inliningNode = node.asInliningNode(); - final List neighbors = inliningNode.getNeighbors(); - - final 
NodeReferenceWithDistance currentNodeReference = currentNodeReferenceAtomic.get(); - double minDistance = currentNodeReference.getDistance(); - - NodeReferenceWithVector nearestNeighbor = null; - for (final NodeReferenceWithVector neighbor : neighbors) { - final double distance = - estimator.distance(neighbor.getVector(), queryVector); - if (distance < minDistance) { - minDistance = distance; - nearestNeighbor = neighbor; - } - } - - if (nearestNeighbor == null) { - return false; - } - - currentNodeReferenceAtomic.set( - new NodeReferenceWithDistance(nearestNeighbor.getPrimaryKey(), nearestNeighbor.getVector(), - minDistance)); - return true; - }), executor).thenApply(ignored -> currentNodeReferenceAtomic.get()); - } +// @Nonnull +// private CompletableFuture greedySearchInliningLayer(@Nonnull final InliningStorageAdapter storageAdapter, +// @Nonnull final ReadTransaction readTransaction, +// @Nonnull final StorageTransform storageTransform, +// @Nonnull final Estimator estimator, +// @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, +// final int layer, +// @Nonnull final Transformed queryVector) { +// final AtomicReference currentNodeReferenceAtomic = +// new AtomicReference<>(nodeReferenceWithDistance); +// +// return AsyncUtil.whileTrue(() -> onReadListener.onAsyncRead( +// storageAdapter.fetchNode(readTransaction, storageTransform, layer, currentNodeReferenceAtomic.get().getPrimaryKey())) +// .thenApply(node -> { +// if (node == null) { +// // +// // This cannot happen under normal circumstances as the storage adapter returns a node with no +// // neighbors if it already has been deleted. Therefore, it is correct to throw here. +// // +// throw new IllegalStateException("unable to fetch node"); +// } +// final InliningNode inliningNode = node.asInliningNode(); +// final List neighbors = inliningNode.getNeighbors(); +// +// final NodeReferenceWithDistance currentNodeReference = currentNodeReferenceAtomic.get(); +// double minDistance = currentNodeReference.getDistance(); +// +// NodeReferenceWithVector nearestNeighbor = null; +// for (final NodeReferenceWithVector neighbor : neighbors) { +// final double distance = +// estimator.distance(neighbor.getVector(), queryVector); +// if (distance < minDistance) { +// minDistance = distance; +// nearestNeighbor = neighbor; +// } +// } +// +// if (nearestNeighbor == null) { +// return false; +// } +// +// currentNodeReferenceAtomic.set( +// new NodeReferenceWithDistance(nearestNeighbor.getPrimaryKey(), nearestNeighbor.getVector(), +// minDistance)); +// return true; +// }), executor).thenApply(ignored -> currentNodeReferenceAtomic.get()); +// } /** * Searches a single layer of the graph to find the nearest neighbors to a query vector. @@ -502,7 +506,9 @@ private CompletableFuture greedySearchInliningLayer(@ return fetchNodeIfNotCached(storageAdapter, readTransaction, storageTransform, layer, candidate, nodeCache) .thenApply(candidateNode -> - Iterables.filter(candidateNode.getNeighbors(), + candidateNode == null + ? 
ImmutableList.of() + : Iterables.filter(candidateNode.getNeighbors(), neighbor -> !visited.contains(Objects.requireNonNull(neighbor).getPrimaryKey()))) .thenCompose(neighborReferences -> fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, neighborReferences, nodeCache)) @@ -599,7 +605,10 @@ private CompletableFuture greedySearchInliningLayer(@ return fetchNodeIfNecessaryAndApply(storageAdapter, readTransaction, storageTransform, layer, nodeReference, nR -> nodeCache.get(nR.getPrimaryKey()), (nR, node) -> { - nodeCache.put(nR.getPrimaryKey(), node); + // TODO maybe use a placeholder instance for null so we won't try multiple times + if (node != null) { + nodeCache.put(nR.getPrimaryKey(), node); + } return node; }); } @@ -651,7 +660,7 @@ private CompletableFuture greedySearchInliningLayer(@ return onReadListener.onAsyncRead( storageAdapter.fetchNode(readTransaction, storageTransform, layer, nodeReference.getPrimaryKey())) - .thenApply(node -> biMapFunction.apply(nodeReference, Objects.requireNonNull(node))); + .thenApply(node -> biMapFunction.apply(nodeReference, node)); } /** @@ -697,14 +706,17 @@ private CompletableFuture greedySearchInliningLayer(@ neighborNode.asCompactNode().getVector()); }, (neighborReference, neighborNode) -> { - // - // At this point we know that the node needed to be fetched, which means this branch cannot be - // reached for INLINING nodes as they never have to be fetched. Therefore, we can safely treat - // the nodes as compact nodes. - // - nodeCache.put(neighborReference.getPrimaryKey(), neighborNode); - return new NodeReferenceWithVector(neighborReference.getPrimaryKey(), - neighborNode.asCompactNode().getVector()); + if (neighborNode != null) { + // + // At this point we know that the node needed to be fetched, which means this branch cannot be + // reached for INLINING nodes as they never have to be fetched. Therefore, we can safely treat + // the nodes as compact nodes. + // + nodeCache.put(neighborReference.getPrimaryKey(), neighborNode); + return new NodeReferenceWithVector(neighborReference.getPrimaryKey(), + neighborNode.asCompactNode().getVector()); + } + return null; }); } @@ -749,8 +761,11 @@ private CompletableFuture greedySearchInliningLayer(@ return new NodeReferenceAndNode<>(nodeReference, node); }, (nodeReference, node) -> { - nodeCache.put(nodeReference.getPrimaryKey(), node); - return new NodeReferenceAndNode<>(nodeReference, node); + if (node != null) { + nodeCache.put(nodeReference.getPrimaryKey(), node); + return new NodeReferenceAndNode<>(nodeReference, node); + } + return null; }); } @@ -778,8 +793,8 @@ private CompletableFuture greedySearchInliningLayer(@ * @param biMapFunction The function to apply when a node is successfully fetched, mapping the original * reference and the fetched {@link AbstractNode} to a result of type {@code U}. * - * @return A {@link CompletableFuture} that, upon completion, will hold a {@link java.util.List} of results - * of type {@code U}, corresponding to each processed node reference. 
+ * @return A {@link CompletableFuture} that, upon completion, will hold a {@link java.util.List} of non-null results + * of type {@code U} */ @Nonnull private CompletableFuture> @@ -794,7 +809,16 @@ private CompletableFuture greedySearchInliningLayer(@ currentNeighborReference -> fetchNodeIfNecessaryAndApply(storageAdapter, readTransaction, storageTransform, layer, currentNeighborReference, fetchBypassFunction, biMapFunction), getConfig().getMaxNumConcurrentNodeFetches(), - getExecutor()); + getExecutor()) + .thenApply(results -> { + final ImmutableList.Builder filteredListBuilder = ImmutableList.builder(); + for (final U result : results) { + if (result != null) { + filteredListBuilder.add(result); + } + } + return filteredListBuilder.build(); + }); } /** @@ -1064,7 +1088,8 @@ private CompletableFuture insertIntoLayers(@Nonnull final Transaction tran (layer, previousNodeReferences) -> { final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); return insertIntoLayer(storageAdapter, transaction, storageTransform, quantizer, - previousNodeReferences, layer, newPrimaryKey, newVector); + previousNodeReferences, layer, newPrimaryKey, newVector) + .thenApply(NodeReferenceAndNode::getReferences); }, executor).thenCompose(ignored -> AsyncUtil.DONE); } @@ -1101,11 +1126,11 @@ private CompletableFuture insertIntoLayers(@Nonnull final Transaction tran * @param newVector the vector associated with the new node * * @return a {@code CompletableFuture} that completes with a list of the nearest neighbors found during the - * initial search phase. This list serves as the entry point for insertion into the next lower layer - * (i.e., {@code layer - 1}). + * initial search phase. This list serves as the entry point for insertion into the next lower layer + * (i.e., {@code layer - 1}). 
*/ @Nonnull - private CompletableFuture> + private CompletableFuture>> insertIntoLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, @@ -1122,75 +1147,72 @@ private CompletableFuture insertIntoLayers(@Nonnull final Transaction tran return searchLayer(storageAdapter, transaction, storageTransform, estimator, nearestNeighbors, layer, config.getEfConstruction(), nodeCache, newVector) - .thenCompose(searchResult -> { - final List references = NodeReferenceAndNode.getReferences(searchResult); - - return extendCandidatesIfNecessary(storageAdapter, transaction, storageTransform, estimator, - searchResult, layer, getConfig().isExtendCandidates(), nodeCache, newVector) - .thenCompose(extendedCandidates -> - selectCandidates(storageAdapter, transaction, storageTransform, estimator, - extendedCandidates, layer, getConfig().getM(), nodeCache)) - .thenCompose(selectedNeighbors -> { - final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); - - final AbstractNode newNode = - nodeFactory.create(newPrimaryKey, newVector, - NodeReferenceAndNode.getReferences(selectedNeighbors)); - - final NeighborsChangeSet newNodeChangeSet = - new InsertNeighborsChangeSet<>( - new BaseNeighborsChangeSet<>(ImmutableList.of()), - newNode.getNeighbors()); - - storageAdapter.writeNode(transaction, quantizer, layer, newNode, - newNodeChangeSet); - - // create change sets for each selected neighbor and insert new node into them - final Map> neighborChangeSetMap = - Maps.newLinkedHashMap(); - for (final NodeReferenceAndNode selectedNeighbor : selectedNeighbors) { - final NeighborsChangeSet baseSet = - new BaseNeighborsChangeSet<>( - selectedNeighbor.getNode().getNeighbors()); - final NeighborsChangeSet insertSet = - new InsertNeighborsChangeSet<>(baseSet, - ImmutableList.of(newNode.getSelfReference(newVector))); - neighborChangeSetMap.put(selectedNeighbor.getNode().getPrimaryKey(), - insertSet); - } - - final int currentMMax = - layer == 0 ? 
getConfig().getMMax0() : getConfig().getMMax(); + .thenCompose(searchResult -> + extendCandidatesIfNecessary(storageAdapter, transaction, storageTransform, estimator, + searchResult, layer, getConfig().isExtendCandidates(), nodeCache, newVector) + .thenCompose(extendedCandidates -> + selectCandidates(storageAdapter, transaction, storageTransform, estimator, + extendedCandidates, layer, getConfig().getM(), nodeCache)) + .thenCompose(selectedNeighbors -> { + final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); + + final AbstractNode newNode = + nodeFactory.create(newPrimaryKey, newVector, + NodeReferenceAndNode.getReferences(selectedNeighbors)); + + final NeighborsChangeSet newNodeChangeSet = + new InsertNeighborsChangeSet<>( + new BaseNeighborsChangeSet<>(ImmutableList.of()), + newNode.getNeighbors()); + + storageAdapter.writeNode(transaction, quantizer, layer, newNode, + newNodeChangeSet); + + // create change sets for each selected neighbor and insert new node into them + final Map> neighborChangeSetMap = + Maps.newLinkedHashMap(); + for (final NodeReferenceAndNode selectedNeighbor : selectedNeighbors) { + final NeighborsChangeSet baseSet = + new BaseNeighborsChangeSet<>( + selectedNeighbor.getNode().getNeighbors()); + final NeighborsChangeSet insertSet = + new InsertNeighborsChangeSet<>(baseSet, + ImmutableList.of(newNode.getSelfReference(newVector))); + neighborChangeSetMap.put(selectedNeighbor.getNode().getPrimaryKey(), + insertSet); + } - return forEach(selectedNeighbors, - selectedNeighbor -> { - final NodeReferenceWithDistance selectedNeighborReference = - selectedNeighbor.getNodeReference(); - final AbstractNode selectedNeighborNode = selectedNeighbor.getNode(); - final NeighborsChangeSet changeSet = - Objects.requireNonNull(neighborChangeSetMap.get(selectedNeighborNode.getPrimaryKey())); - return pruneNeighborsIfNecessary(storageAdapter, transaction, - storageTransform, estimator, layer, selectedNeighborReference, - currentMMax, changeSet, nodeCache) - .thenApply(nodeReferencesAndNodes -> { - if (nodeReferencesAndNodes == null) { - return changeSet; - } - return resolveChangeSetFromNewNeighbors(changeSet, nodeReferencesAndNodes); - }); - }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), getExecutor()) - .thenApply(changeSets -> { - for (int i = 0; i < selectedNeighbors.size(); i++) { - final NodeReferenceAndNode selectedNeighbor = - selectedNeighbors.get(i); - final NeighborsChangeSet changeSet = changeSets.get(i); - storageAdapter.writeNode(transaction, quantizer, - layer, selectedNeighbor.getNode(), changeSet); - } - return ImmutableList.copyOf(references); - }); - }); - }).thenApply(nodeReferencesWithDistances -> { + final int currentMMax = + layer == 0 ? 
getConfig().getMMax0() : getConfig().getMMax(); + + return forEach(selectedNeighbors, + selectedNeighbor -> { + final NodeReferenceWithDistance selectedNeighborReference = + selectedNeighbor.getNodeReference(); + final AbstractNode selectedNeighborNode = selectedNeighbor.getNode(); + final NeighborsChangeSet changeSet = + Objects.requireNonNull(neighborChangeSetMap.get(selectedNeighborNode.getPrimaryKey())); + return pruneNeighborsIfNecessary(storageAdapter, transaction, + storageTransform, estimator, layer, selectedNeighborReference, + currentMMax, changeSet, nodeCache) + .thenApply(nodeReferencesAndNodes -> { + if (nodeReferencesAndNodes == null) { + return changeSet; + } + return resolveChangeSetFromNewNeighbors(changeSet, nodeReferencesAndNodes); + }); + }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), getExecutor()) + .thenApply(changeSets -> { + for (int i = 0; i < selectedNeighbors.size(); i++) { + final NodeReferenceAndNode selectedNeighbor = + selectedNeighbors.get(i); + final NeighborsChangeSet changeSet = changeSets.get(i); + storageAdapter.writeNode(transaction, quantizer, + layer, selectedNeighbor.getNode(), changeSet); + } + return ImmutableList.copyOf(searchResult); + }); + })).thenApply(nodeReferencesWithDistances -> { if (logger.isTraceEnabled()) { logger.trace("end insert key={} at layer={}", newPrimaryKey, layer); } diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 5be34d608a..1dcf15865e 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -302,6 +302,8 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e hnsw.scanLayer(db, 1, 100, node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); Assertions.assertThat(readIds.size()).isBetween(10, 50); + + db.run(tr -> hnsw.delete(tr, Tuple.from(10L)).join()); } @ParameterizedTest() @@ -350,7 +352,7 @@ void testBasicInsertWithRaBitQEncodings(final long seed) { } // - // If we fetch the current state back from the db some vectors are regular vectors and some vectors are + // If we fetch the current state back from the db, some vectors are regular vectors and some vectors are // RaBitQ encoded. Since that information is not surfaced through the API, we need to scan layer 0, get // all vectors directly from disk (encoded/not-encoded, transformed/not-transformed) in order to check // that transformations/reconstructions are applied properly. 
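Note on the two code paths reworked above: both insert and delete lean on a greedy, single-entry descent through a layer's neighbor lists (greedySearchLayer, and the currently commented-out greedySearchInliningLayer), and delete additionally performs a neighbor-repair step that re-links the former neighbors of the removed node (deleteFromLayer / repairInsForNeighborNode). The two sketches below illustrate those ideas over a plain in-memory adjacency map, assuming both classes live in one illustrative file. The Layer interface, the integer node ids, and the naive closest-first re-linking are hypothetical stand-ins; they deliberately ignore the asynchronous StorageAdapter/Estimator machinery, the RaBitQ storage transforms, and the select-neighbors heuristic the real code uses.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Illustrative only: a minimal, in-memory version of the greedy single-entry descent
 * used on the upper layers. The Layer abstraction and int node ids are hypothetical
 * stand-ins, not the FDB-backed storage adapter API.
 */
final class GreedyLayerSearchSketch {
    /** Adjacency and vectors of a single layer. */
    interface Layer {
        List<Integer> neighbors(int nodeId);
        float[] vector(int nodeId);
    }

    /** Plain Euclidean distance; the real code delegates this to an Estimator. */
    static double distance(float[] a, float[] b) {
        double sum = 0.0d;
        for (int i = 0; i < a.length; i++) {
            final double delta = a[i] - b[i];
            sum += delta * delta;
        }
        return Math.sqrt(sum);
    }

    /**
     * Starting from entryId, repeatedly hop to the closest neighbor of the current node
     * until no neighbor is closer to the query; the final node becomes the entry point
     * for the next lower layer.
     */
    static int greedySearch(Layer layer, int entryId, float[] query) {
        int current = entryId;
        double currentDistance = distance(layer.vector(current), query);
        boolean improved = true;
        while (improved) {
            improved = false;
            for (final int neighbor : layer.neighbors(current)) {
                final double d = distance(layer.vector(neighbor), query);
                if (d < currentDistance) {
                    current = neighbor;
                    currentDistance = d;
                    improved = true;
                }
            }
        }
        return current;
    }
}

Deletion has the opposite concern: once a node disappears from a layer, each of its former neighbors loses an edge and is reconnected to nearby survivors so the layer stays navigable. A simplified version of that repair follows, again over an in-memory map and with a naive closest-first selection instead of the select-neighbors heuristic:

/**
 * Illustrative only: simplified neighbor repair after removing a node from one layer.
 * The adjacency/vector maps are hypothetical stand-ins; the real code builds candidate
 * sets per neighbor and runs them through selectCandidates.
 */
final class DeleteRepairSketch {
    static void deleteAndRepair(Map<Integer, Set<Integer>> adjacency,
                                Map<Integer, float[]> vectors,
                                int deleted,
                                int m) {
        final Set<Integer> formerNeighbors = adjacency.remove(deleted);
        if (formerNeighbors == null) {
            return;
        }
        for (final int neighbor : formerNeighbors) {
            final Set<Integer> links = adjacency.get(neighbor);
            links.remove(deleted);

            // Candidate pool: the deleted node's other neighbors that are not linked yet.
            final List<Integer> candidates = new ArrayList<>();
            for (final int candidate : formerNeighbors) {
                if (candidate != neighbor && !links.contains(candidate)) {
                    candidates.add(candidate);
                }
            }

            // Naive selection: closest candidates first, up to the per-layer limit m.
            candidates.sort(Comparator.comparingDouble(
                    (Integer candidate) -> GreedyLayerSearchSketch.distance(
                            vectors.get(neighbor), vectors.get(candidate))));
            for (final int candidate : candidates) {
                if (links.size() >= m) {
                    break;
                }
                links.add(candidate);
                adjacency.get(candidate).add(neighbor);
            }
        }
    }
}

In the FDB-backed code both steps run asynchronously against the storage adapters (AsyncUtil.whileTrue with one node fetch per hop for the descent, MoreAsyncUtil.forEach across layers for the delete), and overfull neighbor lists are trimmed separately via pruneNeighborsIfNecessary.
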
From 93e28dc6b3013628a39dfbbf1c121be5e43795b3 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Mon, 8 Dec 2025 12:05:01 +0100 Subject: [PATCH 05/17] deletes work somewhat --- .../apple/foundationdb/async/hnsw/HNSW.java | 91 +++++-- .../foundationdb/async/hnsw/HNSWTest.java | 255 +++++++++++++++++- 2 files changed, 324 insertions(+), 22 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 21cf3f24a9..faea74edca 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -928,7 +928,7 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N insertIntoLayers(transaction, storageTransform, quantizer, newPrimaryKey, transformedNewVector, nodeReference, lMax, insertionLayer)) .thenCompose(ignored -> - addToStatsIfNecessary(random.split(), transaction, currentAccessInfo, transformedNewVector)); + addToStatsIfNecessary(random, transaction, currentAccessInfo, transformedNewVector)); }).thenCompose(ignored -> AsyncUtil.DONE); } @@ -1212,7 +1212,8 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) } return ImmutableList.copyOf(searchResult); }); - })).thenApply(nodeReferencesWithDistances -> { + })) + .thenApply(nodeReferencesWithDistances -> { if (logger.isTraceEnabled()) { logger.trace("end insert key={} at layer={}", newPrimaryKey, layer); } @@ -1567,9 +1568,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) final int layer, @Nonnull final Map> nodeCache) { final Iterable toBeFetched = - Iterables.filter(resolveNeighborReferences(initialNodeReferenceAndNodes, hopMode), - nodeReference -> samplingPredicate.test(random, - initialNodeReferenceAndNodes.size(), nodeReference)); + resolveNeighborReferences(initialNodeReferenceAndNodes, random, hopMode, samplingPredicate); return fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, toBeFetched, nodeCache); } @@ -1587,8 +1586,10 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReferenceWithDistance} */ private Set - resolveNeighborReferences(@Nonnull final Iterable> initialNodeReferenceAndNodes, - @Nonnull final HopMode hopMode) { + resolveNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, + @Nullable final SplittableRandom random, + @Nonnull final HopMode hopMode, + @Nonnull final CandidateSamplingPredicate samplingPredicate) { final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { @@ -1604,6 +1605,12 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { for (final N neighbor : nodeReferenceAndNode.getNode().getNeighbors()) { final Tuple neighborPrimaryKey = neighbor.getPrimaryKey(); + + if (!samplingPredicate.test(random, + initialNodeReferenceAndNodes.size(), neighbor)) { + continue; + } + @Nullable final NodeReferenceAndNode initialNode = initialNodesMap.get(neighborPrimaryKey); if (initialNode != null) { // @@ -1703,7 +1710,7 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N final SplittableRandom random = random(primaryKey); 
final int topLayer = topLayer(primaryKey); if (logger.isTraceEnabled()) { - logger.trace("new node with key={} selected to be deleted form layer={}", primaryKey, topLayer); + logger.trace("new node with key={} to be deleted form layer={}", primaryKey, topLayer); } return StorageAdapter.fetchAccessInfo(getConfig(), transaction, getSubspace(), getOnReadListener()) @@ -1772,15 +1779,15 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, final int topLayer) { - if (logger.isTraceEnabled()) { - logger.trace("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, primaryKey); + if (logger.isDebugEnabled()) { + logger.debug("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, primaryKey); } return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), layer -> { final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); - return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, random, layer, - primaryKey); + return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, random.split(), + layer, primaryKey); }, getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor); @@ -1812,8 +1819,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, final int layer, @Nonnull final Tuple toBeDeletedPrimaryKey) { - if (logger.isTraceEnabled()) { - logger.trace("begin delete key={} at layer={}", toBeDeletedPrimaryKey, layer); + if (logger.isDebugEnabled()) { + logger.debug("begin delete key={} at layer={}", toBeDeletedPrimaryKey, layer); } final Estimator estimator = quantizer.estimator(); final Map> nodeCache = Maps.newConcurrentMap(); @@ -1836,14 +1843,44 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi .thenApply(candidates -> { final ImmutableList.Builder> filteredCandidatesBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode neighbor : candidates) { + for (final NodeReferenceAndNode candidate : candidates) { // filter out neighbors that happen to be the node we are trying to delete - if (!neighbor.getNodeReference().getPrimaryKey().equals(toBeDeletedPrimaryKey)) { - filteredCandidatesBuilder.add(neighbor); + if (!candidate.getNodeReference().getPrimaryKey().equals(toBeDeletedPrimaryKey)) { + filteredCandidatesBuilder.add(candidate); } } return filteredCandidatesBuilder.build(); }) + .thenApply(candidates -> { + if (logger.isDebugEnabled()) { + final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode candidate : candidates) { + candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); + } + logger.debug("resolved candidates={}", String.join(",", + candidateStringsBuilder.build())); + } + return candidates; + }) + .thenApply(candidates -> { + for (final NodeReferenceAndNode candidate : candidates) { + final AbstractNode neighbors = candidate.getNode(); + for (final N neighborOfCandidate : neighbors.getNeighbors()) { + if (neighborOfCandidate.getPrimaryKey().equals(toBeDeletedPrimaryKey)) { + // + // Make sure the neighbor pointing to the node-to-be-deleted is deleted as + // well. 
+ // + candidateChangeSetMap.put(neighbors.getPrimaryKey(), + new DeleteNeighborsChangeSet<>( + new BaseNeighborsChangeSet<>(neighbors.getNeighbors()), + ImmutableList.of(toBeDeletedPrimaryKey))); + break; + } + } + } + return candidates; + }) .thenCompose(candidates -> forEach(toBeDeletedNode.getNeighbors(), // for each direct neighbor neighborReference -> @@ -1915,8 +1952,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi layer); }); }).thenApply(result -> { - if (logger.isTraceEnabled()) { - logger.trace("end delete key={} at layer={}", toBeDeletedPrimaryKey, layer); + if (logger.isDebugEnabled()) { + logger.debug("end delete key={} at layer={}", toBeDeletedPrimaryKey, layer); } return result; }); @@ -1938,6 +1975,10 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi return fetchNodeIfNotCached(storageAdapter, transaction, storageTransform, layer, neighborReference, nodeCache) .thenCompose(neighborNode -> { + if (neighborNode == null) { + // node could not be fetched; maybe it was deleted already -> ignore + return AsyncUtil.DONE; + } final ImmutableList.Builder candidatesReferencesBuilder = ImmutableList.builder(); final Transformed neighborVector = storageAdapter.getVector(neighborReference, neighborNode); @@ -1972,6 +2013,18 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi final Map> nodeCache) { return selectCandidates(storageAdapter, transaction, storageTransform, estimator, candidates, layer, getConfig().getM(), nodeCache) + .thenApply(selectedCandidates -> { + if (logger.isDebugEnabled()) { + final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode candidate : selectedCandidates) { + candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); + } + logger.debug("selected for neighbor={}, candidates={}", + neighborReference.getPrimaryKey(), + String.join(",", candidateStringsBuilder.build())); + } + return selectedCandidates; + }) .thenCompose(selectedCandidates -> { // create change sets for each selected neighbor and insert new node into them for (final NodeReferenceAndNode selectedCandidate : selectedCandidates) { diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 1dcf15865e..27a474f2cf 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -30,6 +30,7 @@ import com.apple.foundationdb.linear.Quantizer; import com.apple.foundationdb.linear.RealVector; import com.apple.foundationdb.linear.StoredVecsIterator; +import com.apple.foundationdb.linear.Transformed; import com.apple.foundationdb.rabitq.EncodedRealVector; import com.apple.foundationdb.test.TestDatabaseExtension; import com.apple.foundationdb.test.TestExecutors; @@ -61,6 +62,8 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; +import java.io.BufferedWriter; +import java.io.FileWriter; import java.io.IOException; import java.nio.channels.FileChannel; import java.nio.file.Path; @@ -77,6 +80,7 @@ import java.util.Set; import java.util.TreeSet; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiFunction; import java.util.stream.Collectors; @@ -230,7 +234,7 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e final TreeSet 
recordsOrderedByDistance = new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); - for (int i = 0; i < 1000;) { + for (int i = 0; i < 1000; ) { i += basicInsertBatch(hnsw, 100, i, onReadListener, (tr, nextId) -> { final var primaryKey = createPrimaryKey(nextId); @@ -251,7 +255,7 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e // This should not fail but should be silently ignored. If this succeeds, the following searches will all // return records that are not aligned with recordsOrderedByDistance. // - for (int i = 0; i < 100;) { + for (int i = 0; i < 100; ) { i += basicInsertBatch(hnsw, 100, 0, onReadListener, (tr, ignored) -> { final var primaryKey = createPrimaryKey(random.nextInt(1000)); @@ -278,7 +282,7 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), resultEntry.getDistance()); if (trueNN.contains(resultEntry.getPrimaryKey())) { - recallCount ++; + recallCount++; } } final double recall = (double)recallCount / (double)k; @@ -302,8 +306,253 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e hnsw.scanLayer(db, 1, 100, node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); Assertions.assertThat(readIds.size()).isBetween(10, 50); + } + + @ParameterizedTest() + @RandomSeedSource({0x0fdbL}) + void testBasicInsertDelete2D(final long seed) throws Exception { + final Random random = new Random(seed); + final Metric metric = Metric.EUCLIDEAN_METRIC; + final TestOnReadListener onReadListener = new TestOnReadListener(); + + final int numDimensions = 2; + final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), + HNSW.newConfigBuilder() + .setMetric(metric) + .setUseInlining(false) + .setExtendCandidates(false) + .setKeepPrunedConnections(false) + .setUseRaBitQ(false) + .setRaBitQNumExBits(5) + .setSampleVectorStatsProbability(1.0d) + .setMaintainStatsProbability(0.1d) + .setStatsThreshold(100) + .setM(5) + .setMMax(10) + .setMMax0(10) + .build(numDimensions), + OnWriteListener.NOOP, onReadListener); + + final int k = 50; + final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); + final TreeSet recordsOrderedByDistance = + new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); + + for (int i = 0; i < 1000;) { + i += basicInsertBatch(hnsw, 100, i, onReadListener, + (tr, nextId) -> { + final var primaryKey = createPrimaryKey(nextId); + final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); + final double distance = metric.distance(dataVector, queryVector); + final PrimaryKeyVectorAndDistance record = + new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); + recordsOrderedByDistance.add(record); + if (recordsOrderedByDistance.size() > k) { + recordsOrderedByDistance.pollLast(); + } + return record; + }); + } + + onReadListener.reset(); + final long beginTs = System.nanoTime(); + final List results = + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + final long endTs = System.nanoTime(); + + final ImmutableSet trueNN = + recordsOrderedByDistance.stream() + .map(PrimaryKeyVectorAndDistance::getPrimaryKey) + .collect(ImmutableSet.toImmutableSet()); + + int recallCount = 0; + for (ResultEntry resultEntry : results) { + logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), + 
resultEntry.getDistance()); + if (trueNN.contains(resultEntry.getPrimaryKey())) { + recallCount ++; + } + } + final double recall = (double)recallCount / (double)k; + logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), + String.format(Locale.ROOT, "%.2f", recall * 100.0d)); + Assertions.assertThat(recall).isGreaterThan(0.9); + + final Set insertedIds = + LongStream.range(0, 1000) + .boxed() + .collect(Collectors.toSet()); + + final Set readIds = Sets.newHashSet(); + hnsw.scanLayer(db, 0, 100, + node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + Assertions.assertThat(readIds).isEqualTo(insertedIds); + + readIds.clear(); + hnsw.scanLayer(db, 1, 100, + node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + //Assertions.assertThat(readIds.size()).isBetween(10, 50); + + int layer = 0; + while (true) { + if (!dumpLayer(hnsw, "before", layer++)) { + break; + } + } db.run(tr -> hnsw.delete(tr, Tuple.from(10L)).join()); + db.run(tr -> hnsw.delete(tr, Tuple.from(777L)).join()); + + layer = 0; + while (true) { + if (!dumpLayer(hnsw, "after", layer++)) { + break; + } + } + } + + @ParameterizedTest() + @RandomSeedSource({0x0fdbL}) + void testBasicInsertDelete502D(final long seed) throws Exception { + final Random random = new Random(seed); + final Metric metric = Metric.EUCLIDEAN_METRIC; + final TestOnReadListener onReadListener = new TestOnReadListener(); + + final int numDimensions = 2; + final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), + HNSW.newConfigBuilder() + .setMetric(metric) + .setUseInlining(false) + .setExtendCandidates(false) + .setKeepPrunedConnections(false) + .setUseRaBitQ(false) + .setRaBitQNumExBits(5) + .setSampleVectorStatsProbability(1.0d) + .setMaintainStatsProbability(0.1d) + .setStatsThreshold(100) + .setM(5) + .setMMax(10) + .setMMax0(10) + .build(numDimensions), + OnWriteListener.NOOP, onReadListener); + + final int k = 50; + final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); + final TreeSet recordsOrderedByDistance = + new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); + + for (int i = 0; i < 1000;) { + i += basicInsertBatch(hnsw, 100, i, onReadListener, + (tr, nextId) -> { + final var primaryKey = createPrimaryKey(nextId); + final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); + final double distance = metric.distance(dataVector, queryVector); + final PrimaryKeyVectorAndDistance record = + new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); + recordsOrderedByDistance.add(record); + if (recordsOrderedByDistance.size() > k) { + recordsOrderedByDistance.pollLast(); + } + return record; + }); + } + + onReadListener.reset(); + final long beginTs = System.nanoTime(); + final List results = + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + final long endTs = System.nanoTime(); + + final ImmutableSet trueNN = + recordsOrderedByDistance.stream() + .map(PrimaryKeyVectorAndDistance::getPrimaryKey) + .collect(ImmutableSet.toImmutableSet()); + + int recallCount = 0; + for (ResultEntry resultEntry : results) { + logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), + resultEntry.getDistance()); + if (trueNN.contains(resultEntry.getPrimaryKey())) 
{ + recallCount ++; + } + } + final double recall = (double)recallCount / (double)k; + logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), + String.format(Locale.ROOT, "%.2f", recall * 100.0d)); + Assertions.assertThat(recall).isGreaterThan(0.9); + + final Set insertedIds = + LongStream.range(0, 1000) + .boxed() + .collect(Collectors.toSet()); + + final Set readIds = Sets.newHashSet(); + hnsw.scanLayer(db, 0, 100, + node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + Assertions.assertThat(readIds).isEqualTo(insertedIds); + + readIds.clear(); + hnsw.scanLayer(db, 1, 100, + node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + //Assertions.assertThat(readIds.size()).isBetween(10, 50); + + int layer = 0; + while (true) { + if (!dumpLayer(hnsw, "before50", layer++)) { + break; + } + } + + for (int i = 250; i < 750;) { + for (int b = 0; b < 10; b ++) { + final Tuple primaryKey = Tuple.from((long)i); + db.run(tr -> hnsw.delete(tr, primaryKey).join()); + i++; + } + } + + layer = 0; + while (true) { + if (!dumpLayer(hnsw, "after50", layer++)) { + break; + } + } + } + + private boolean dumpLayer(@Nonnull final HNSW hnsw, @Nonnull final String prefix, final int layer) throws IOException { + final String verticesFileName = "/Users/nseemann/Downloads/vertices-" + prefix + "-" + layer + ".csv"; + final String edgesFileName = "/Users/nseemann/Downloads/edges-" + prefix + "-" + layer + ".csv"; + + final AtomicLong numReadAtomic = new AtomicLong(0L); + try (final BufferedWriter verticesWriter = new BufferedWriter(new FileWriter(verticesFileName)); + final BufferedWriter edgesWriter = new BufferedWriter(new FileWriter(edgesFileName))) { + hnsw.scanLayer(db, layer, 100, node -> { + final CompactNode compactNode = node.asCompactNode(); + final Transformed vector = compactNode.getVector(); + try { + verticesWriter.write(compactNode.getPrimaryKey().getLong(0) + "," + + vector.getUnderlyingVector().getComponent(0) + "," + + vector.getUnderlyingVector().getComponent(1)); + verticesWriter.newLine(); + + for (final var neighbor : compactNode.getNeighbors()) { + edgesWriter.write(compactNode.getPrimaryKey().getLong(0) + "," + + neighbor.getPrimaryKey().getLong(0)); + edgesWriter.newLine(); + } + numReadAtomic.getAndIncrement(); + } catch (final IOException e) { + throw new RuntimeException("unable to write to file", e); + } + }); + } + return numReadAtomic.get() != 0; } @ParameterizedTest() From 3204811c24cfa716226f3dfc387d08f2dee96153 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Mon, 8 Dec 2025 15:32:18 +0100 Subject: [PATCH 06/17] deletes work --- .../apple/foundationdb/async/hnsw/HNSW.java | 29 ++- .../async/hnsw/NodeReferenceAndNode.java | 12 +- .../foundationdb/async/hnsw/HNSWTest.java | 180 +++++++++++++++++- 3 files changed, 197 insertions(+), 24 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index faea74edca..b9c009e2a8 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -1520,7 +1520,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * * @return a {@link CompletableFuture} which will complete with a list of 
fetched nodes */ - private CompletableFuture>> + private CompletableFuture>> neighbors(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, @@ -1557,7 +1557,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * * @return a {@link CompletableFuture} which will complete with a list of {@link NodeReferenceWithVector} */ - private CompletableFuture> + private CompletableFuture> neighborReferences(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, @Nonnull final AffineOperator storageTransform, @@ -1585,7 +1585,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReferenceWithDistance} */ - private Set + private Set resolveNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, @Nullable final SplittableRandom random, @Nonnull final HopMode hopMode, @@ -1598,8 +1598,8 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) resultBuilder.add(nodeReferenceAndNode.getNodeReference()); } } - final ImmutableMap> initialNodesMap = initialNodesMapBuilder.build(); + final ImmutableMap> initialNodesMap = initialNodesMapBuilder.build(); final Set nodeReferencesSeen = Sets.newHashSet(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { @@ -1819,8 +1819,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, final int layer, @Nonnull final Tuple toBeDeletedPrimaryKey) { - if (logger.isDebugEnabled()) { - logger.debug("begin delete key={} at layer={}", toBeDeletedPrimaryKey, layer); + if (logger.isTraceEnabled()) { + logger.trace("begin delete key={} at layer={}", toBeDeletedPrimaryKey, layer); } final Estimator estimator = quantizer.estimator(); final Map> nodeCache = Maps.newConcurrentMap(); @@ -1829,9 +1829,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi return storageAdapter.fetchNode(transaction, storageTransform, layer, toBeDeletedPrimaryKey) .thenCompose(toBeDeletedNode -> { - final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = - new NodeReferenceAndNode<>(new NodeReferenceWithVector(toBeDeletedPrimaryKey, - toBeDeletedNode.asCompactNode().getVector()), toBeDeletedNode); + final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = + new NodeReferenceAndNode<>(new NodeReference(toBeDeletedPrimaryKey), toBeDeletedNode); return neighbors(storageAdapter, transaction, storageTransform, random, ImmutableList.of(toBeDeletedNodeReferenceAndNode), HopMode.INCLUSIVE, @@ -1852,12 +1851,12 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi return filteredCandidatesBuilder.build(); }) .thenApply(candidates -> { - if (logger.isDebugEnabled()) { + if (logger.isTraceEnabled()) { final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); for (final NodeReferenceAndNode candidate : candidates) { candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); } - logger.debug("resolved candidates={}", String.join(",", + logger.trace("resolved candidates={}", String.join(",", candidateStringsBuilder.build())); } return candidates; @@ -1952,8 +1951,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi layer); }); }).thenApply(result -> { - if (logger.isDebugEnabled()) { - logger.debug("end delete key={} at layer={}", toBeDeletedPrimaryKey, layer); + if (logger.isTraceEnabled()) { + logger.trace("end delete 
key={} at layer={}", toBeDeletedPrimaryKey, layer); } return result; }); @@ -2014,12 +2013,12 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi return selectCandidates(storageAdapter, transaction, storageTransform, estimator, candidates, layer, getConfig().getM(), nodeCache) .thenApply(selectedCandidates -> { - if (logger.isDebugEnabled()) { + if (logger.isTraceEnabled()) { final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); for (final NodeReferenceAndNode candidate : selectedCandidates) { candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); } - logger.debug("selected for neighbor={}, candidates={}", + logger.trace("selected for neighbor={}, candidates={}", neighborReference.getPrimaryKey(), String.join(",", candidateStringsBuilder.build())); } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java index 96c152fcc9..1317a77081 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java @@ -33,22 +33,22 @@ * pieces of information together. * @param the type of {@link NodeReference} used within the {@link AbstractNode} */ -class NodeReferenceAndNode { +class NodeReferenceAndNode { @Nonnull - private final T nodeReferenceWithDistance; + private final T nodeReference; @Nonnull private final AbstractNode node; /** * Constructs a new instance that pairs a node reference (with distance) with its * corresponding {@link AbstractNode} object. - * @param nodeReferenceWithDistance the reference to a node, which also includes distance information. Must not be + * @param nodeReference the reference to a node, which also includes distance information. Must not be * {@code null}. * @param node the actual {@link AbstractNode} object that the reference points to. Must not be {@code null}. 
*/ - public NodeReferenceAndNode(@Nonnull final T nodeReferenceWithDistance, + public NodeReferenceAndNode(@Nonnull final T nodeReference, @Nonnull final AbstractNode node) { - this.nodeReferenceWithDistance = nodeReferenceWithDistance; + this.nodeReference = nodeReference; this.node = node; } @@ -58,7 +58,7 @@ public NodeReferenceAndNode(@Nonnull final T nodeReferenceWithDistance, */ @Nonnull public T getNodeReference() { - return nodeReferenceWithDistance; + return nodeReference; } /** diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 27a474f2cf..ea617851b9 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -274,6 +274,7 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e final ImmutableSet trueNN = recordsOrderedByDistance.stream() + .limit(k) .map(PrimaryKeyVectorAndDistance::getPrimaryKey) .collect(ImmutableSet.toImmutableSet()); @@ -363,6 +364,7 @@ void testBasicInsertDelete2D(final long seed) throws Exception { final ImmutableSet trueNN = recordsOrderedByDistance.stream() + .limit(k) .map(PrimaryKeyVectorAndDistance::getPrimaryKey) .collect(ImmutableSet.toImmutableSet()); @@ -469,6 +471,7 @@ void testBasicInsertDelete502D(final long seed) throws Exception { final ImmutableSet trueNN = recordsOrderedByDistance.stream() + .limit(k) .map(PrimaryKeyVectorAndDistance::getPrimaryKey) .collect(ImmutableSet.toImmutableSet()); @@ -523,6 +526,172 @@ void testBasicInsertDelete502D(final long seed) throws Exception { break; } } + + onReadListener.reset(); + final List resultsAfterDeletes = + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + + final ImmutableSet trueAfterDeletesNN = + recordsOrderedByDistance.stream() + .map(PrimaryKeyVectorAndDistance::getPrimaryKey) + .filter(primaryKey -> primaryKey.getLong(0) < 250 || primaryKey.getLong(0) >= 750) + .limit(k) + .collect(ImmutableSet.toImmutableSet()); + + int recallCountAfterDeletes = 0; + for (ResultEntry resultEntry : resultsAfterDeletes) { + logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), + resultEntry.getDistance()); + if (trueAfterDeletesNN.contains(resultEntry.getPrimaryKey())) { + recallCountAfterDeletes ++; + } + } + final double recallAfterDeletes = (double)recallCountAfterDeletes / (double)k; + logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), + String.format(Locale.ROOT, "%.2f", recallAfterDeletes * 100.0d)); + Assertions.assertThat(recallAfterDeletes).isGreaterThan(0.9); + } + + @ParameterizedTest() + @RandomSeedSource({0x0fdbL}) + void testBasicInsertDelete503D(final long seed) throws Exception { + final Random random = new Random(seed); + final Metric metric = Metric.EUCLIDEAN_METRIC; + final TestOnReadListener onReadListener = new TestOnReadListener(); + + final int numDimensions = 3; + final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), + HNSW.newConfigBuilder() + .setMetric(metric) + .setUseInlining(false) + .setExtendCandidates(false) + .setKeepPrunedConnections(false) + .setUseRaBitQ(false) + .setRaBitQNumExBits(5) + .setSampleVectorStatsProbability(1.0d) + 
.setMaintainStatsProbability(0.1d) + .setStatsThreshold(100) + .setM(5) + .setMMax(10) + .setMMax0(10) + .build(numDimensions), + OnWriteListener.NOOP, onReadListener); + + final int k = 50; + final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); + final TreeSet recordsOrderedByDistance = + new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); + + for (int i = 0; i < 1000;) { + i += basicInsertBatch(hnsw, 100, i, onReadListener, + (tr, nextId) -> { + final var primaryKey = createPrimaryKey(nextId); + final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); + final double distance = metric.distance(dataVector, queryVector); + final PrimaryKeyVectorAndDistance record = + new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); + recordsOrderedByDistance.add(record); + if (recordsOrderedByDistance.size() > k) { + recordsOrderedByDistance.pollLast(); + } + return record; + }); + } + + onReadListener.reset(); + final long beginTs = System.nanoTime(); + final List results = + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + final long endTs = System.nanoTime(); + + final ImmutableSet trueNN = + recordsOrderedByDistance.stream() + .limit(k) + .map(PrimaryKeyVectorAndDistance::getPrimaryKey) + .collect(ImmutableSet.toImmutableSet()); + + int recallCount = 0; + for (ResultEntry resultEntry : results) { + logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), + resultEntry.getDistance()); + if (trueNN.contains(resultEntry.getPrimaryKey())) { + recallCount ++; + } + } + final double recall = (double)recallCount / (double)k; + logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), + String.format(Locale.ROOT, "%.2f", recall * 100.0d)); + Assertions.assertThat(recall).isGreaterThan(0.9); + + final Set insertedIds = + LongStream.range(0, 1000) + .boxed() + .collect(Collectors.toSet()); + + final Set readIds = Sets.newHashSet(); + hnsw.scanLayer(db, 0, 100, + node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + Assertions.assertThat(readIds).isEqualTo(insertedIds); + + readIds.clear(); + hnsw.scanLayer(db, 1, 100, + node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + //Assertions.assertThat(readIds.size()).isBetween(10, 50); + + int layer = 0; + while (true) { + if (!dumpLayer(hnsw, "before503D", layer++)) { + break; + } + } + + for (int i = 250; i < 750;) { + for (int b = 0; b < 10; b ++) { + final Tuple primaryKey = Tuple.from((long)i); + db.run(tr -> hnsw.delete(tr, primaryKey).join()); + i++; + } + } + + layer = 0; + while (true) { + if (!dumpLayer(hnsw, "after503D", layer++)) { + break; + } + } + + onReadListener.reset(); + final List resultsAfterDeletes = + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + + final ImmutableSet trueAfterDeletesNN = + recordsOrderedByDistance.stream() + .map(PrimaryKeyVectorAndDistance::getPrimaryKey) + .filter(primaryKey -> primaryKey.getLong(0) < 250 || primaryKey.getLong(0) >= 750) + .limit(k) + .collect(ImmutableSet.toImmutableSet()); + + int recallCountAfterDeletes = 0; + for (ResultEntry resultEntry : resultsAfterDeletes) { + logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), + resultEntry.getDistance()); + if 
(trueAfterDeletesNN.contains(resultEntry.getPrimaryKey())) { + recallCountAfterDeletes ++; + } + } + final double recallAfterDeletes = (double)recallCountAfterDeletes / (double)k; + logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), + String.format(Locale.ROOT, "%.2f", recallAfterDeletes * 100.0d)); + Assertions.assertThat(recallAfterDeletes).isGreaterThan(0.9); } private boolean dumpLayer(@Nonnull final HNSW hnsw, @Nonnull final String prefix, final int layer) throws IOException { @@ -536,9 +705,14 @@ private boolean dumpLayer(@Nonnull final HNSW hnsw, @Nonnull final String prefix final CompactNode compactNode = node.asCompactNode(); final Transformed vector = compactNode.getVector(); try { - verticesWriter.write(compactNode.getPrimaryKey().getLong(0) + "," + - vector.getUnderlyingVector().getComponent(0) + "," + - vector.getUnderlyingVector().getComponent(1)); + verticesWriter.write(compactNode.getPrimaryKey().getLong(0) + ","); + final RealVector realVector = vector.getUnderlyingVector(); + for (int i = 0; i < realVector.getNumDimensions(); i++) { + if (i != 0) { + verticesWriter.write(","); + } + verticesWriter.write(String.valueOf(realVector.getComponent(i))); + } verticesWriter.newLine(); for (final var neighbor : compactNode.getNeighbors()) { From 88b94c5b0b2df722bed8c615c2b4fe92abe5d7f2 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Wed, 10 Dec 2025 22:23:43 +0100 Subject: [PATCH 07/17] update to junit 5.14; test refactorings --- build/reports/problems/problems-report.html | 663 +++++++++++++++++ .../apple/foundationdb/async/hnsw/Config.java | 2 +- .../apple/foundationdb/async/hnsw/HNSW.java | 188 ++--- .../async/hnsw/NodeReferenceAndNode.java | 7 +- .../foundationdb/async/hnsw/HNSWTest.java | 699 +++++++----------- .../VectorIndexScanComparisons.java | 18 +- .../indexes/VectorIndexMaintainer.java | 2 +- .../record/metadata/MetaDataProtoTest.java | 4 +- .../RemoteFetchIndexScanTest.java | 2 +- .../RemoteFetchMultiColumnKeyTest.java | 2 +- .../RemoteFetchSplitRecordsTest.java | 2 +- .../foundationdb/RemoteFetchTest.java | 2 +- .../indexes/VersionIndexTest.java | 2 +- .../plan/cascades/ArithmeticValueTest.java | 4 +- .../query/plan/cascades/BooleanValueTest.java | 7 +- .../plan/cascades/LikeOperatorValueTest.java | 7 +- .../record/query/plan/cascades/TypeTest.java | 7 +- .../cascades/VariadicFunctionValueTest.java | 4 +- .../query/plan/plans/ExplodePlanTest.java | 4 +- .../relational/api/ddl/SqlFunctionTest.java | 2 +- .../autotest/engine/AutoTestDescriptor.java | 11 +- .../autotest/engine/AutoTestEngine.java | 9 +- .../engine/WorkloadTestDescriptor.java | 13 +- .../recordlayer/PlanGenerationStackTest.java | 4 +- .../recordlayer/query/QueryTypeTests.java | 4 +- .../apple/test/BooleanArgumentsProvider.java | 4 +- .../com/apple/test/RandomSeedProvider.java | 4 +- gradle/libs.versions.toml | 6 +- 28 files changed, 1111 insertions(+), 572 deletions(-) create mode 100644 build/reports/problems/problems-report.html diff --git a/build/reports/problems/problems-report.html b/build/reports/problems/problems-report.html new file mode 100644 index 0000000000..e05028f0f7 --- /dev/null +++ b/build/reports/problems/problems-report.html @@ -0,0 +1,663 @@ + + + + + + + + + + + + + Gradle Configuration Cache + + + +

+ +
+ Loading... +
+ + + + + + diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java index 82b945785d..66d4322ebf 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java @@ -322,7 +322,7 @@ public int hashCode() { @Override @Nonnull public String toString() { - return "Config[" + ", metric=" + getMetric() + ", numDimensions=" + getNumDimensions() + + return "Config[" + "metric=" + getMetric() + ", numDimensions=" + getNumDimensions() + ", isUseInlining=" + isUseInlining() + ", M=" + getM() + ", MMax=" + getMMax() + ", MMax0=" + getMMax0() + ", efConstruction=" + getEfConstruction() + ", isExtendCandidates=" + isExtendCandidates() + diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index b9c009e2a8..08292cb18e 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -1482,9 +1482,9 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) if (isExtendCandidates) { return neighborReferences(storageAdapter, readTransaction, storageTransform, null, candidates, - HopMode.INCLUSIVE, CandidateSamplingPredicate.tautology(), layer, nodeCache) - .thenApply(hop2 -> { - for (final NodeReferenceWithVector nodeReferenceWithVector : hop2) { + CandidateSamplingPredicate.tautology(), layer, nodeCache) + .thenApply(neighborsOfCandidates -> { + for (final NodeReferenceWithVector nodeReferenceWithVector : neighborsOfCandidates) { final double distance = estimator.distance(nodeReferenceWithVector.getVector(), vector); resultBuilder.add(new NodeReferenceWithDistance(nodeReferenceWithVector.getPrimaryKey(), nodeReferenceWithVector.getVector(), distance)); @@ -1526,15 +1526,14 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) @Nonnull final AffineOperator storageTransform, @Nonnull final SplittableRandom random, @Nonnull final Collection> initialNodeReferenceAndNodes, - @Nonnull final HopMode hopMode, @Nonnull final CandidateSamplingPredicate samplingPredicate, final int layer, @Nonnull final Map> nodeCache) { return neighborReferences(storageAdapter, readTransaction, storageTransform, random, - initialNodeReferenceAndNodes, hopMode, samplingPredicate, layer, nodeCache) - .thenCompose(neighborsFirstDegree -> + initialNodeReferenceAndNodes, samplingPredicate, layer, nodeCache) + .thenCompose(neighbors -> fetchSomeNodesIfNotCached(storageAdapter, readTransaction, storageTransform, layer, - neighborsFirstDegree, nodeCache)); + neighbors, nodeCache)); } /** @@ -1550,7 +1549,6 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) * @param random a {@link SplittableRandom} to be used for sampling * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes, which have already been * evaluated - * @param hopMode the {@link HopMode} we should use * @param samplingPredicate a predicate that restricts the number of neighbors to be fetched * @param layer the graph layer from which to fetch nodes * @param nodeCache a cache mapping primary keys to {@link AbstractNode} objects to avoid redundant fetches @@ -1563,12 +1561,11 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) @Nonnull final AffineOperator storageTransform, @Nullable final 
SplittableRandom random, @Nonnull final Collection> initialNodeReferenceAndNodes, - @Nonnull final HopMode hopMode, @Nonnull final CandidateSamplingPredicate samplingPredicate, final int layer, @Nonnull final Map> nodeCache) { final Iterable toBeFetched = - resolveNeighborReferences(initialNodeReferenceAndNodes, random, hopMode, samplingPredicate); + resolveNeighborReferences(initialNodeReferenceAndNodes, random, samplingPredicate); return fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, toBeFetched, nodeCache); } @@ -1588,15 +1585,12 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private Set resolveNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, @Nullable final SplittableRandom random, - @Nonnull final HopMode hopMode, @Nonnull final CandidateSamplingPredicate samplingPredicate) { final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { initialNodesMapBuilder.put(nodeReferenceAndNode.getNode().getPrimaryKey(), nodeReferenceAndNode); - if (hopMode == HopMode.INCLUSIVE) { - resultBuilder.add(nodeReferenceAndNode.getNodeReference()); - } + resultBuilder.add(nodeReferenceAndNode.getNodeReference()); } final ImmutableMap> initialNodesMap = initialNodesMapBuilder.build(); @@ -1611,16 +1605,16 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) continue; } + // + // We need to distinguish between initial node references and non-initial node references: + // Initial nodes references are of type T (and sometimes already contain a vector in which case + // we do not want to refetch the node later if we don't have to). The initial nodes already have been + // added earlier in this method (with or without a vector). The neighbors that are not initial most + // likely do not contain a vector which is fine but if T != N, we need to be careful in order to not + // create duplicates in this set. + // @Nullable final NodeReferenceAndNode initialNode = initialNodesMap.get(neighborPrimaryKey); - if (initialNode != null) { - // - // This is an initial node which happens to be a neighbor of another initial node. We already have - // everything we need to put this node into the result without fetching it. - // - if (hopMode != HopMode.EXCLUSIVE) { - resultBuilder.add(initialNode.getNodeReference()); - } - } else if (!nodeReferencesSeen.contains(neighborPrimaryKey)) { + if (initialNode == null && !nodeReferencesSeen.contains(neighborPrimaryKey)) { // // This is a node that is currently not known to us. It is not an initial node. We need to fetch it, // and we need to mark it as seen so we won't consider it more than once. 
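To make the de-duplication concern described in the comment above concrete, the following is a minimal editorial sketch; the helper name, generics, and imports are invented for illustration and are not part of this patch. It assumes that NodeReferenceWithVector extends NodeReference and that neighbor lists expose plain NodeReference entries. Initial references keep their richer type (they may already carry a vector), so a neighbor that is also an initial node must not be re-added as a plain reference for the same primary key:

import com.apple.foundationdb.tuple.Tuple;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

// Simplified, non-authoritative sketch of the de-duplication performed above (invented helper).
static Set<NodeReference> dedupedNeighborReferences(
        final List<NodeReferenceAndNode<NodeReferenceWithVector>> initialNodes) {
    final Set<Tuple> seenPrimaryKeys = new HashSet<>();
    final Set<NodeReference> result = new LinkedHashSet<>();
    for (final NodeReferenceAndNode<NodeReferenceWithVector> initial : initialNodes) {
        result.add(initial.getNodeReference());                 // keep the typed reference; it may already carry a vector
        seenPrimaryKeys.add(initial.getNode().getPrimaryKey()); // never re-add the same node as a plain reference
    }
    for (final NodeReferenceAndNode<NodeReferenceWithVector> initial : initialNodes) {
        for (final NodeReference neighbor : initial.getNode().getNeighbors()) {
            if (seenPrimaryKeys.add(neighbor.getPrimaryKey())) { // skips initial nodes and repeated neighbors
                result.add(neighbor);                            // plain reference; its vector is fetched later if needed
            }
        }
    }
    return result;
}

The sketch omits the CandidateSamplingPredicate-based sampling that the real method accepts; only the primary-key-based dedup is shown.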
@@ -1779,8 +1773,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, final int topLayer) { - if (logger.isDebugEnabled()) { - logger.debug("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, primaryKey); + if (logger.isTraceEnabled()) { + logger.trace("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, primaryKey); } return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), @@ -1832,35 +1826,8 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = new NodeReferenceAndNode<>(new NodeReference(toBeDeletedPrimaryKey), toBeDeletedNode); - return neighbors(storageAdapter, transaction, storageTransform, random, - ImmutableList.of(toBeDeletedNodeReferenceAndNode), HopMode.INCLUSIVE, - CandidateSamplingPredicate.tautology(), layer, nodeCache) - .thenCompose(candidates -> - neighbors(storageAdapter, transaction, storageTransform, random, - candidates, HopMode.INCLUSIVE, - this::shouldSampleCandidate, layer, nodeCache)) - .thenApply(candidates -> { - final ImmutableList.Builder> filteredCandidatesBuilder = - ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - // filter out neighbors that happen to be the node we are trying to delete - if (!candidate.getNodeReference().getPrimaryKey().equals(toBeDeletedPrimaryKey)) { - filteredCandidatesBuilder.add(candidate); - } - } - return filteredCandidatesBuilder.build(); - }) - .thenApply(candidates -> { - if (logger.isTraceEnabled()) { - final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); - } - logger.trace("resolved candidates={}", String.join(",", - candidateStringsBuilder.build())); - } - return candidates; - }) + return candidatesForRepairs(storageAdapter, transaction, storageTransform, random, layer, + toBeDeletedNodeReferenceAndNode, nodeCache) .thenApply(candidates -> { for (final NodeReferenceAndNode candidate : candidates) { final AbstractNode neighbors = candidate.getNode(); @@ -1958,6 +1925,47 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi }); } + @Nonnull + private CompletableFuture>> + candidatesForRepairs(final @Nonnull StorageAdapter storageAdapter, + final @Nonnull Transaction transaction, + final @Nonnull AffineOperator storageTransform, + final @Nonnull SplittableRandom random, + final int layer, + final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode, + final Map> nodeCache) { + return neighbors(storageAdapter, transaction, storageTransform, random, + ImmutableList.of(toBeDeletedNodeReferenceAndNode), + CandidateSamplingPredicate.tautology(), layer, nodeCache) + .thenCompose(candidates -> + neighbors(storageAdapter, transaction, storageTransform, random, + candidates, + this::shouldSampleCandidate, layer, nodeCache)) + .thenApply(candidates -> { + final ImmutableList.Builder> filteredCandidatesBuilder = + ImmutableList.builder(); + for (final NodeReferenceAndNode candidate : candidates) { + // filter out neighbors that happen to be the node we are trying to delete + if (!candidate.getNodeReference().getPrimaryKey() + .equals(toBeDeletedNodeReferenceAndNode.getNode().getPrimaryKey())) { + filteredCandidatesBuilder.add(candidate); + } + } + return filteredCandidatesBuilder.build(); + }) + 
.thenApply(candidates -> { + if (logger.isTraceEnabled()) { + final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode candidate : candidates) { + candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); + } + logger.trace("resolved candidates={}", String.join(",", + candidateStringsBuilder.build())); + } + return candidates; + }); + } + private @Nonnull CompletableFuture prepareCandidatesAndRepairNeighbor(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @@ -2061,11 +2069,14 @@ layer, getConfig().getM(), nodeCache) * found in the layer. */ @VisibleForTesting - void scanLayer(@Nonnull final Database db, - final int layer, - final int batchSize, - @Nonnull final Consumer> nodeConsumer) { - final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); + static void scanLayer(@Nonnull final Config config, + @Nonnull final Subspace subspace, + @Nonnull final Database db, + final int layer, + final int batchSize, + @Nonnull final Consumer> nodeConsumer) { + final StorageAdapter storageAdapter = + storageAdapterForLayer(config, subspace, OnWriteListener.NOOP, OnReadListener.NOOP, layer); final AtomicReference lastPrimaryKeyAtomic = new AtomicReference<>(); Tuple newPrimaryKey; do { @@ -2078,7 +2089,7 @@ void scanLayer(@Nonnull final Database db, lastPrimaryKeyAtomic.set(node.getPrimaryKey()); }); return lastPrimaryKeyAtomic.get(); - }, executor); + }); } while (newPrimaryKey != null); } @@ -2089,17 +2100,12 @@ void scanLayer(@Nonnull final Database db, * use an {@code InliningStorageAdapter} for layers greater than {@code 0} and a {@code CompactStorageAdapter} for * layer 0. Note that we will only use inlining at all if the config indicates we should use inlining. * - * @param layer the layer number for which to get the storage adapter; currently unused - * @return a non-null {@link StorageAdapter} instance, which will always be a - * {@link CompactStorageAdapter} in the current implementation + * @param layer the layer number for which to get the storage adapter + * @return a non-null {@link StorageAdapter} instance */ @Nonnull private StorageAdapter getStorageAdapterForLayer(final int layer) { - return config.isUseInlining() && layer > 0 - ? new InliningStorageAdapter(getConfig(), InliningNode.factory(), getSubspace(), getOnWriteListener(), - getOnReadListener()) - : new CompactStorageAdapter(getConfig(), CompactNode.factory(), getSubspace(), getOnWriteListener(), - getOnReadListener()); + return storageAdapterForLayer(getConfig(), getSubspace(), getOnWriteListener(), getOnReadListener(), layer); } @Nonnull @@ -2142,6 +2148,33 @@ private boolean shouldMaintainStats(@Nonnull final SplittableRandom random) { return random.nextDouble() < getConfig().getMaintainStatsProbability(); } + /** + * Gets the appropriate storage adapter for a given layer. + *

+ * This method selects a {@link StorageAdapter} implementation based on the layer number. The logic is intended to + * use an {@code InliningStorageAdapter} for layers greater than {@code 0} and a {@code CompactStorageAdapter} for + * layer 0. Note that we will only use inlining at all if the config indicates we should use inlining. + * + * @param config the config to use + * @param subspace the subspace of the HNSW object itself + * @param onWriteListener a listener that the new {@link StorageAdapter} will call back for any write events + * @param onReadListener a listener that the new {@link StorageAdapter} will call back for any read events + * @param layer the layer number for which to get the storage adapter + * @return a non-null {@link StorageAdapter} instance + */ + @Nonnull + @VisibleForTesting + static StorageAdapter + storageAdapterForLayer(@Nonnull final Config config, + @Nonnull final Subspace subspace, + @Nonnull final OnWriteListener onWriteListener, + @Nonnull final OnReadListener onReadListener, + final int layer) { + return config.isUseInlining() && layer > 0 + ? new InliningStorageAdapter(config, InliningNode.factory(), subspace, onWriteListener, onReadListener) + : new CompactStorageAdapter(config, CompactNode.factory(), subspace, onWriteListener, onReadListener); + } + private static double splitMixDouble(final long x) { return (splitMixLong(x) >>> 11) * 0x1.0p-53; } @@ -2163,25 +2196,6 @@ private static List drain(@Nonnull Queue queue) { return resultBuilder.build(); } - /** - * Let {@code I} be the set of initial nodes for {@link #neighborReferences}. Let {@code H(I)} be the set of nodes that can be - * reached by traversing the neighbors of nodes in {@code I}. - */ - private enum HopMode { - /** - * Return {@code I union H(I)}. - */ - INCLUSIVE, - /** - * Return {@code H(I) \ I}. - */ - EXCLUSIVE, - /** - * Return {@code H(I)}. - */ - EXCLUSIVE_ALL - } - @FunctionalInterface private interface CandidateSamplingPredicate { @Nonnull diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java index 1317a77081..f64759b36e 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java @@ -26,7 +26,7 @@ import java.util.List; /** - * A container class that pairs a {@link NodeReferenceWithDistance} with its corresponding {@link AbstractNode} object. + * A container class that pairs a {@link NodeReference} with its corresponding {@link AbstractNode} object. *

* This is often used during graph traversal or searching, where a reference to a node (along with its distance from a * query point) is first identified, and then the complete node data is fetched. This class holds these two related @@ -70,6 +70,11 @@ public AbstractNode getNode() { return node; } + @Override + public String toString() { + return "NB[" + nodeReference + "," + node + ']'; + } + /** * Helper to extract the references from a given collection of objects of this container class. * @param referencesAndNodes an iterable of {@link NodeReferenceAndNode} objects from which to extract the diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index ea617851b9..9b8be31711 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -43,18 +43,23 @@ import com.google.common.base.Verify; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.ObjectArrays; import com.google.common.collect.Sets; import org.assertj.core.api.Assertions; -import org.assertj.core.util.Lists; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.AfterTestExecutionCallback; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.ExtensionContext; import org.junit.jupiter.api.extension.RegisterExtension; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.params.ParameterInfo; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.aggregator.ArgumentsAccessor; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; import org.slf4j.Logger; @@ -70,12 +75,15 @@ import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.util.ArrayList; +import java.util.Collection; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.NavigableSet; import java.util.Objects; +import java.util.Optional; import java.util.Random; import java.util.Set; import java.util.TreeSet; @@ -83,6 +91,7 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiFunction; +import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.LongStream; import java.util.stream.Stream; @@ -194,43 +203,42 @@ void testInliningSerialization(final long seed) { )).join()); } - static Stream randomSeedsWithOptions() { + static Stream randomSeedsWithConfig() { return RandomizedTestUtils.randomSeeds(0xdeadc0deL) .flatMap(seed -> Sets.cartesianProduct(ImmutableSet.of(true, false), ImmutableSet.of(true, false), ImmutableSet.of(true, false), ImmutableSet.of(true, false)).stream() - .map(arguments -> Arguments.of(ObjectArrays.concat(seed, arguments.toArray())))); + .map(arguments -> Arguments.of(ObjectArrays.concat(seed, + new Object[] {HNSW.newConfigBuilder() + .setMetric(Metric.EUCLIDEAN_METRIC) + .setUseInlining(arguments.get(0)) + .setExtendCandidates(arguments.get(1)) + 
.setKeepPrunedConnections(arguments.get(2)) + .setUseRaBitQ(arguments.get(3)) + .setRaBitQNumExBits(5) + .setSampleVectorStatsProbability(1.0d) + .setMaintainStatsProbability(0.1d) + .setStatsThreshold(100) + .setM(32) + .setMMax(32) + .setMMax0(64) + .build(128)})))); } - @ParameterizedTest(name = "seed={0} useInlining={1} extendCandidates={2} keepPrunedConnections={3} useRaBitQ={4}") - @MethodSource("randomSeedsWithOptions") - void testBasicInsert(final long seed, final boolean useInlining, final boolean extendCandidates, - final boolean keepPrunedConnections, final boolean useRaBitQ) { + @ParameterizedTest + @MethodSource("randomSeedsWithConfig") + void testBasicInsert(final long seed, final Config config) { final Random random = new Random(seed); - final Metric metric = Metric.EUCLIDEAN_METRIC; + final Metric metric = config.getMetric(); final TestOnReadListener onReadListener = new TestOnReadListener(); - final int numDimensions = 128; - final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), - HNSW.newConfigBuilder() - .setMetric(metric) - .setUseInlining(useInlining) - .setExtendCandidates(extendCandidates) - .setKeepPrunedConnections(keepPrunedConnections) - .setUseRaBitQ(useRaBitQ) - .setRaBitQNumExBits(5) - .setSampleVectorStatsProbability(1.0d) - .setMaintainStatsProbability(0.1d) - .setStatsThreshold(100) - .setM(32) - .setMMax(32) - .setMMax0(64) - .build(numDimensions), - OnWriteListener.NOOP, onReadListener); + final HNSW hnsw = + new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), config, + OnWriteListener.NOOP, onReadListener); final int k = 50; - final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); + final HalfRealVector queryVector = createRandomHalfVector(random, config.getNumDimensions()); final TreeSet recordsOrderedByDistance = new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); @@ -238,7 +246,7 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e i += basicInsertBatch(hnsw, 100, i, onReadListener, (tr, nextId) -> { final var primaryKey = createPrimaryKey(nextId); - final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); + final HalfRealVector dataVector = createRandomHalfVector(random, config.getNumDimensions()); final double distance = metric.distance(dataVector, queryVector); final PrimaryKeyVectorAndDistance record = new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); @@ -259,7 +267,7 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e i += basicInsertBatch(hnsw, 100, 0, onReadListener, (tr, ignored) -> { final var primaryKey = createPrimaryKey(random.nextInt(1000)); - final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); + final HalfRealVector dataVector = createRandomHalfVector(random, config.getNumDimensions()); final double distance = metric.distance(dataVector, queryVector); return new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); }); @@ -299,434 +307,113 @@ void testBasicInsert(final long seed, final boolean useInlining, final boolean e .collect(Collectors.toSet()); final Set readIds = Sets.newHashSet(); - hnsw.scanLayer(db, 0, 100, + scanLayer(config, 0, 100, node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); Assertions.assertThat(readIds).isEqualTo(insertedIds); readIds.clear(); - hnsw.scanLayer(db, 1, 100, + scanLayer(config, 1, 100, node -> 
Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); Assertions.assertThat(readIds.size()).isBetween(10, 50); } - @ParameterizedTest() - @RandomSeedSource({0x0fdbL}) - void testBasicInsertDelete2D(final long seed) throws Exception { - final Random random = new Random(seed); - final Metric metric = Metric.EUCLIDEAN_METRIC; - final TestOnReadListener onReadListener = new TestOnReadListener(); - - final int numDimensions = 2; - final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), - HNSW.newConfigBuilder() - .setMetric(metric) - .setUseInlining(false) - .setExtendCandidates(false) - .setKeepPrunedConnections(false) - .setUseRaBitQ(false) - .setRaBitQNumExBits(5) - .setSampleVectorStatsProbability(1.0d) - .setMaintainStatsProbability(0.1d) - .setStatsThreshold(100) - .setM(5) - .setMMax(10) - .setMMax0(10) - .build(numDimensions), - OnWriteListener.NOOP, onReadListener); - - final int k = 50; - final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); - final TreeSet recordsOrderedByDistance = - new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); - - for (int i = 0; i < 1000;) { - i += basicInsertBatch(hnsw, 100, i, onReadListener, - (tr, nextId) -> { - final var primaryKey = createPrimaryKey(nextId); - final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); - final double distance = metric.distance(dataVector, queryVector); - final PrimaryKeyVectorAndDistance record = - new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); - recordsOrderedByDistance.add(record); - if (recordsOrderedByDistance.size() > k) { - recordsOrderedByDistance.pollLast(); - } - return record; - }); - } - - onReadListener.reset(); - final long beginTs = System.nanoTime(); - final List results = - db.run(tr -> - hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - final long endTs = System.nanoTime(); - - final ImmutableSet trueNN = - recordsOrderedByDistance.stream() - .limit(k) - .map(PrimaryKeyVectorAndDistance::getPrimaryKey) - .collect(ImmutableSet.toImmutableSet()); - - int recallCount = 0; - for (ResultEntry resultEntry : results) { - logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), - resultEntry.getDistance()); - if (trueNN.contains(resultEntry.getPrimaryKey())) { - recallCount ++; - } - } - final double recall = (double)recallCount / (double)k; - logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", - TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), - onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), - String.format(Locale.ROOT, "%.2f", recall * 100.0d)); - Assertions.assertThat(recall).isGreaterThan(0.9); - - final Set insertedIds = - LongStream.range(0, 1000) - .boxed() - .collect(Collectors.toSet()); - - final Set readIds = Sets.newHashSet(); - hnsw.scanLayer(db, 0, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - Assertions.assertThat(readIds).isEqualTo(insertedIds); - - readIds.clear(); - hnsw.scanLayer(db, 1, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - //Assertions.assertThat(readIds.size()).isBetween(10, 50); - - int layer = 0; - while (true) { - if (!dumpLayer(hnsw, "before", layer++)) { - break; - } - } - - db.run(tr -> hnsw.delete(tr, Tuple.from(10L)).join()); - db.run(tr -> hnsw.delete(tr, Tuple.from(777L)).join()); - - layer = 0; - while 
(true) { - if (!dumpLayer(hnsw, "after", layer++)) { - break; - } - } - } - - @ParameterizedTest() - @RandomSeedSource({0x0fdbL}) - void testBasicInsertDelete502D(final long seed) throws Exception { + @ExtendWith(HNSWTest.DumpLayersIfFailure.class) + @ParameterizedTest + @MethodSource("randomSeedsWithConfig") + void testBasicInsertDelete(final long seed, final Config config) { final Random random = new Random(seed); - final Metric metric = Metric.EUCLIDEAN_METRIC; + final int size = 1000; final TestOnReadListener onReadListener = new TestOnReadListener(); - final int numDimensions = 2; - final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), - HNSW.newConfigBuilder() - .setMetric(metric) - .setUseInlining(false) - .setExtendCandidates(false) - .setKeepPrunedConnections(false) - .setUseRaBitQ(false) - .setRaBitQNumExBits(5) - .setSampleVectorStatsProbability(1.0d) - .setMaintainStatsProbability(0.1d) - .setStatsThreshold(100) - .setM(5) - .setMMax(10) - .setMMax0(10) - .build(numDimensions), + final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), config, OnWriteListener.NOOP, onReadListener); final int k = 50; - final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); - final TreeSet recordsOrderedByDistance = - new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); + final List insertedData = randomVectors(random, config.getNumDimensions(), 1000); - for (int i = 0; i < 1000;) { + for (int i = 0; i < size;) { i += basicInsertBatch(hnsw, 100, i, onReadListener, - (tr, nextId) -> { - final var primaryKey = createPrimaryKey(nextId); - final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); - final double distance = metric.distance(dataVector, queryVector); - final PrimaryKeyVectorAndDistance record = - new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); - recordsOrderedByDistance.add(record); - if (recordsOrderedByDistance.size() > k) { - recordsOrderedByDistance.pollLast(); - } - return record; - }); + (tr, nextId) -> insertedData.get(Math.toIntExact(nextId))); } - onReadListener.reset(); - final long beginTs = System.nanoTime(); - final List results = - db.run(tr -> - hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - final long endTs = System.nanoTime(); + final int numVectorsPerDeleteBatch = 100; + List remainingData = insertedData; + do { + final List toBeDeleted = + pickRandomVectors(random, remainingData, numVectorsPerDeleteBatch); - final ImmutableSet trueNN = - recordsOrderedByDistance.stream() - .limit(k) - .map(PrimaryKeyVectorAndDistance::getPrimaryKey) - .collect(ImmutableSet.toImmutableSet()); - - int recallCount = 0; - for (ResultEntry resultEntry : results) { - logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), - resultEntry.getDistance()); - if (trueNN.contains(resultEntry.getPrimaryKey())) { - recallCount ++; - } - } - final double recall = (double)recallCount / (double)k; - logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", - TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), - onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), - String.format(Locale.ROOT, "%.2f", recall * 100.0d)); - Assertions.assertThat(recall).isGreaterThan(0.9); - - final Set insertedIds = - LongStream.range(0, 1000) - .boxed() - .collect(Collectors.toSet()); - - final Set readIds = Sets.newHashSet(); - hnsw.scanLayer(db, 0, 100, - 
node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - Assertions.assertThat(readIds).isEqualTo(insertedIds); - - readIds.clear(); - hnsw.scanLayer(db, 1, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - //Assertions.assertThat(readIds.size()).isBetween(10, 50); - - int layer = 0; - while (true) { - if (!dumpLayer(hnsw, "before50", layer++)) { - break; - } - } - - for (int i = 250; i < 750;) { - for (int b = 0; b < 10; b ++) { - final Tuple primaryKey = Tuple.from((long)i); - db.run(tr -> hnsw.delete(tr, primaryKey).join()); - i++; - } - } - - layer = 0; - while (true) { - if (!dumpLayer(hnsw, "after50", layer++)) { - break; - } - } - - onReadListener.reset(); - final List resultsAfterDeletes = - db.run(tr -> - hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - - final ImmutableSet trueAfterDeletesNN = - recordsOrderedByDistance.stream() - .map(PrimaryKeyVectorAndDistance::getPrimaryKey) - .filter(primaryKey -> primaryKey.getLong(0) < 250 || primaryKey.getLong(0) >= 750) - .limit(k) - .collect(ImmutableSet.toImmutableSet()); - - int recallCountAfterDeletes = 0; - for (ResultEntry resultEntry : resultsAfterDeletes) { - logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), - resultEntry.getDistance()); - if (trueAfterDeletesNN.contains(resultEntry.getPrimaryKey())) { - recallCountAfterDeletes ++; - } - } - final double recallAfterDeletes = (double)recallCountAfterDeletes / (double)k; - logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", - TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), - onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), - String.format(Locale.ROOT, "%.2f", recallAfterDeletes * 100.0d)); - Assertions.assertThat(recallAfterDeletes).isGreaterThan(0.9); - } - - @ParameterizedTest() - @RandomSeedSource({0x0fdbL}) - void testBasicInsertDelete503D(final long seed) throws Exception { - final Random random = new Random(seed); - final Metric metric = Metric.EUCLIDEAN_METRIC; - final TestOnReadListener onReadListener = new TestOnReadListener(); - - final int numDimensions = 3; - final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), - HNSW.newConfigBuilder() - .setMetric(metric) - .setUseInlining(false) - .setExtendCandidates(false) - .setKeepPrunedConnections(false) - .setUseRaBitQ(false) - .setRaBitQNumExBits(5) - .setSampleVectorStatsProbability(1.0d) - .setMaintainStatsProbability(0.1d) - .setStatsThreshold(100) - .setM(5) - .setMMax(10) - .setMMax0(10) - .build(numDimensions), - OnWriteListener.NOOP, onReadListener); - - final int k = 50; - final HalfRealVector queryVector = createRandomHalfVector(random, numDimensions); - final TreeSet recordsOrderedByDistance = - new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); - - for (int i = 0; i < 1000;) { - i += basicInsertBatch(hnsw, 100, i, onReadListener, - (tr, nextId) -> { - final var primaryKey = createPrimaryKey(nextId); - final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); - final double distance = metric.distance(dataVector, queryVector); - final PrimaryKeyVectorAndDistance record = - new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); - recordsOrderedByDistance.add(record); - if (recordsOrderedByDistance.size() > k) { - recordsOrderedByDistance.pollLast(); - } - return record; - }); - } + onReadListener.reset(); - 
onReadListener.reset(); - final long beginTs = System.nanoTime(); - final List results = - db.run(tr -> - hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - final long endTs = System.nanoTime(); + long beginTs = System.nanoTime(); + db.run(tr -> { + for (final PrimaryKeyAndVector primaryKeyAndVector : toBeDeleted) { + hnsw.delete(tr, primaryKeyAndVector.getPrimaryKey()).join(); + } + return null; + }); + long endTs = System.nanoTime(); - final ImmutableSet trueNN = - recordsOrderedByDistance.stream() - .limit(k) - .map(PrimaryKeyVectorAndDistance::getPrimaryKey) - .collect(ImmutableSet.toImmutableSet()); + logger.info("delete transaction of {} records after {} records took elapsedTime={}ms; read nodes={}, read bytes={}", + numVectorsPerDeleteBatch, + size - remainingData.size(), + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer()); - int recallCount = 0; - for (ResultEntry resultEntry : results) { - logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), - resultEntry.getDistance()); - if (trueNN.contains(resultEntry.getPrimaryKey())) { - recallCount ++; - } - } - final double recall = (double)recallCount / (double)k; - logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", - TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), - onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), - String.format(Locale.ROOT, "%.2f", recall * 100.0d)); - Assertions.assertThat(recall).isGreaterThan(0.9); + final Set deletedSet = toBeDeleted.stream().collect(ImmutableSet.toImmutableSet()); + remainingData = remainingData.stream() + .filter(vector -> !deletedSet.contains(vector)) + .collect(ImmutableList.toImmutableList()); - final Set insertedIds = - LongStream.range(0, 1000) - .boxed() - .collect(Collectors.toSet()); + if (!remainingData.isEmpty()) { + final HalfRealVector queryVector = createRandomHalfVector(random, config.getNumDimensions()); + final ImmutableSet trueNN = + orderedByDistances(Metric.EUCLIDEAN_METRIC, remainingData, queryVector).stream() + .limit(k) + .map(PrimaryKeyVectorAndDistance::getPrimaryKey) + .collect(ImmutableSet.toImmutableSet()); - final Set readIds = Sets.newHashSet(); - hnsw.scanLayer(db, 0, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - Assertions.assertThat(readIds).isEqualTo(insertedIds); - - readIds.clear(); - hnsw.scanLayer(db, 1, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - //Assertions.assertThat(readIds.size()).isBetween(10, 50); + onReadListener.reset(); - int layer = 0; - while (true) { - if (!dumpLayer(hnsw, "before503D", layer++)) { - break; - } - } + beginTs = System.nanoTime(); + final List results = + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + endTs = System.nanoTime(); - for (int i = 250; i < 750;) { - for (int b = 0; b < 10; b ++) { - final Tuple primaryKey = Tuple.from((long)i); - db.run(tr -> hnsw.delete(tr, primaryKey).join()); - i++; - } - } + int recallCount = 0; + for (ResultEntry resultEntry : results) { + if (trueNN.contains(resultEntry.getPrimaryKey())) { + recallCount++; + } + } + final double recall = (double)recallCount / (double)trueNN.size(); + +// if (recall == 0.7) { +// int layer = 0; +// while (true) { +// if (!dumpLayer(hnsw, "debug", layer++)) { +// break; +// } +// } +// } + + logger.info("search transaction after delete 
of {} records took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", + size - remainingData.size(), + TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), + String.format(Locale.ROOT, "%.2f", recall * 100.0d)); + Assertions.assertThat(recall).isGreaterThan(0.9); - layer = 0; - while (true) { - if (!dumpLayer(hnsw, "after503D", layer++)) { - break; + final long remainingNumNodes = countNodesOnLayer(config, 0); + Assertions.assertThat(remainingNumNodes).isEqualTo(remainingData.size()); } - } - - onReadListener.reset(); - final List resultsAfterDeletes = - db.run(tr -> - hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - - final ImmutableSet trueAfterDeletesNN = - recordsOrderedByDistance.stream() - .map(PrimaryKeyVectorAndDistance::getPrimaryKey) - .filter(primaryKey -> primaryKey.getLong(0) < 250 || primaryKey.getLong(0) >= 750) - .limit(k) - .collect(ImmutableSet.toImmutableSet()); + } while (!remainingData.isEmpty()); - int recallCountAfterDeletes = 0; - for (ResultEntry resultEntry : resultsAfterDeletes) { - logger.info("nodeId ={} at distance={}", resultEntry.getPrimaryKey().getLong(0), - resultEntry.getDistance()); - if (trueAfterDeletesNN.contains(resultEntry.getPrimaryKey())) { - recallCountAfterDeletes ++; - } - } - final double recallAfterDeletes = (double)recallCountAfterDeletes / (double)k; - logger.info("search transaction took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", - TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), - onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), - String.format(Locale.ROOT, "%.2f", recallAfterDeletes * 100.0d)); - Assertions.assertThat(recallAfterDeletes).isGreaterThan(0.9); - } - - private boolean dumpLayer(@Nonnull final HNSW hnsw, @Nonnull final String prefix, final int layer) throws IOException { - final String verticesFileName = "/Users/nseemann/Downloads/vertices-" + prefix + "-" + layer + ".csv"; - final String edgesFileName = "/Users/nseemann/Downloads/edges-" + prefix + "-" + layer + ".csv"; - - final AtomicLong numReadAtomic = new AtomicLong(0L); - try (final BufferedWriter verticesWriter = new BufferedWriter(new FileWriter(verticesFileName)); - final BufferedWriter edgesWriter = new BufferedWriter(new FileWriter(edgesFileName))) { - hnsw.scanLayer(db, layer, 100, node -> { - final CompactNode compactNode = node.asCompactNode(); - final Transformed vector = compactNode.getVector(); - try { - verticesWriter.write(compactNode.getPrimaryKey().getLong(0) + ","); - final RealVector realVector = vector.getUnderlyingVector(); - for (int i = 0; i < realVector.getNumDimensions(); i++) { - if (i != 0) { - verticesWriter.write(","); - } - verticesWriter.write(String.valueOf(realVector.getComponent(i))); - } - verticesWriter.newLine(); - - for (final var neighbor : compactNode.getNeighbors()) { - edgesWriter.write(compactNode.getPrimaryKey().getLong(0) + "," + - neighbor.getPrimaryKey().getLong(0)); - edgesWriter.newLine(); - } - numReadAtomic.getAndIncrement(); - } catch (final IOException e) { - throw new RuntimeException("unable to write to file", e); - } - }); - } - return numReadAtomic.get() != 0; + final var accessInfo = + db.run(transaction -> StorageAdapter.fetchAccessInfo(hnsw.getConfig(), + transaction, hnsw.getSubspace(), OnReadListener.NOOP).join()); + Assertions.assertThat(accessInfo).isNull(); + Assertions.assertThat((Double)null).isNotNull(); } @ParameterizedTest() @@ -781,7 +468,7 @@ void 
testBasicInsertWithRaBitQEncodings(final long seed) { // that transformations/reconstructions are applied properly. // final Map fromDBMap = Maps.newHashMap(); - hnsw.scanLayer(db, 0, 100, + scanLayer(hnsw.getConfig(), 0, 100, node -> fromDBMap.put(node.getPrimaryKey(), node.asCompactNode().getVector().getUnderlyingVector())); @@ -962,6 +649,95 @@ private void validateSIFTSmall(@Nonnull final HNSW hnsw, @Nonnull final Map randomVectors(@Nonnull final Random random, final int numDimensions, + final int numberOfVectors) { + final ImmutableList.Builder resultBuilder = ImmutableList.builder(); + for (int i = 0; i < numberOfVectors; i ++) { + final var primaryKey = createPrimaryKey(i); + final HalfRealVector dataVector = createRandomHalfVector(random, numDimensions); + resultBuilder.add(new PrimaryKeyAndVector(primaryKey, dataVector)); + } + return resultBuilder.build(); + } + + @Nonnull + private List pickRandomVectors(@Nonnull final Random random, + @Nonnull final Collection vectors, + final int numberOfVectors) { + Verify.verify(numberOfVectors <= vectors.size()); + final List remainingVectors = Lists.newArrayList(vectors); + final ImmutableList.Builder resultBuilder = ImmutableList.builder(); + for (int i = 0; i < numberOfVectors; i ++) { + resultBuilder.add(remainingVectors.remove(random.nextInt(remainingVectors.size()))); + } + return resultBuilder.build(); + } + + @Nonnull + private NavigableSet orderedByDistances(@Nonnull final Metric metric, + @Nonnull final List vectors, + @Nonnull final HalfRealVector queryVector) { + final TreeSet vectorsOrderedByDistance = + new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); + for (final PrimaryKeyAndVector vector : vectors) { + final double distance = metric.distance(vector.getVector(), queryVector); + final PrimaryKeyVectorAndDistance record = + new PrimaryKeyVectorAndDistance(vector.getPrimaryKey(), vector.getVector(), distance); + vectorsOrderedByDistance.add(record); + } + return vectorsOrderedByDistance; + } + + private long countNodesOnLayer(@Nonnull final Config config, final int layer) { + final AtomicLong counter = new AtomicLong(); + scanLayer(config, layer, 100, node -> counter.incrementAndGet()); + return counter.get(); + } + + private void scanLayer(@Nonnull final Config config, + final int layer, + final int batchSize, + @Nonnull final Consumer> nodeConsumer) { + HNSW.scanLayer(config, rtSubspace.getSubspace(), db, layer, batchSize, nodeConsumer); + } + + private boolean dumpLayer(@Nonnull final Config config, + @Nonnull final String prefix, final int layer) throws IOException { + final String verticesFileName = "/Users/nseemann/Downloads/vertices-" + prefix + "-" + layer + ".csv"; + final String edgesFileName = "/Users/nseemann/Downloads/edges-" + prefix + "-" + layer + ".csv"; + + final AtomicLong numReadAtomic = new AtomicLong(0L); + try (final BufferedWriter verticesWriter = new BufferedWriter(new FileWriter(verticesFileName)); + final BufferedWriter edgesWriter = new BufferedWriter(new FileWriter(edgesFileName))) { + scanLayer(config, layer, 100, node -> { + final CompactNode compactNode = node.asCompactNode(); + final Transformed vector = compactNode.getVector(); + try { + verticesWriter.write(compactNode.getPrimaryKey().getLong(0) + ","); + final RealVector realVector = vector.getUnderlyingVector(); + for (int i = 0; i < realVector.getNumDimensions(); i++) { + if (i != 0) { + verticesWriter.write(","); + } + verticesWriter.write(String.valueOf(realVector.getComponent(i))); + } + 
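+                            // illustrative row for a 3-dimensional vector: 42,0.12,0.87,0.33 (primary key, then one component per dimension; values hypothetical)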
verticesWriter.newLine(); + + for (final var neighbor : compactNode.getNeighbors()) { + edgesWriter.write(compactNode.getPrimaryKey().getLong(0) + "," + + neighbor.getPrimaryKey().getLong(0)); + edgesWriter.newLine(); + } + numReadAtomic.getAndIncrement(); + } catch (final IOException e) { + throw new RuntimeException("unable to write to file", e); + } + }); + } + return numReadAtomic.get() != 0; + } + private void writeNode(@Nonnull final Transaction transaction, @Nonnull final StorageAdapter storageAdapter, @Nonnull final AbstractNode node, @@ -1027,6 +803,42 @@ private static Tuple createPrimaryKey(final long nextId) { return Tuple.from(nextId); } + public static class DumpLayersIfFailure implements AfterTestExecutionCallback { + @Override + public void afterTestExecution(@Nonnull final ExtensionContext context) { + final Optional failure = context.getExecutionException(); + if (failure.isEmpty()) { + return; + } + + final ParameterInfo parameterInfo = ParameterInfo.get(context); + + if (parameterInfo != null) { + final ArgumentsAccessor args = parameterInfo.getArguments(); + + final HNSWTest hnswTest = (HNSWTest)context.getRequiredTestInstance(); + final Config config = (Config)args.get(1); + logger.error("dumping contents of HNSW to disk"); + dumpLayers(hnswTest, config); + } else { + logger.error("test failed with no parameterized arguments (non-parameterized test or older JUnit)."); + } + } + + private void dumpLayers(@Nonnull final HNSWTest hnswTest, @Nonnull final Config config) { + int layer = 0; + while (true) { + try { + if (!hnswTest.dumpLayer(config, "debug", layer++)) { + break; + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + } + private static class TestOnReadListener implements OnReadListener { final Map nodeCountByLayer; final Map sumMByLayer; @@ -1090,6 +902,20 @@ public Tuple getPrimaryKey() { public RealVector getVector() { return vector; } + + @Override + public boolean equals(final Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + final PrimaryKeyAndVector that = (PrimaryKeyAndVector)o; + return Objects.equals(getPrimaryKey(), that.getPrimaryKey()) && Objects.equals(getVector(), that.getVector()); + } + + @Override + public int hashCode() { + return Objects.hash(getPrimaryKey(), getVector()); + } } private static class PrimaryKeyVectorAndDistance extends PrimaryKeyAndVector { @@ -1105,5 +931,22 @@ public PrimaryKeyVectorAndDistance(@Nonnull final Tuple primaryKey, public double getDistance() { return distance; } + + @Override + public boolean equals(final Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + final PrimaryKeyVectorAndDistance that = (PrimaryKeyVectorAndDistance)o; + return Double.compare(getDistance(), that.getDistance()) == 0; + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), getDistance()); + } } } diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java index d2cc648f3e..bee821db00 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java @@ -62,9 +62,9 @@ public class 
VectorIndexScanComparisons implements IndexScanParameters { @Nonnull private final VectorIndexScanOptions vectorIndexScanOptions; - public VectorIndexScanComparisons(@Nonnull final ScanComparisons prefixScanComparisons, - @Nonnull final DistanceRankValueComparison distanceRankValueComparison, - @Nonnull final VectorIndexScanOptions vectorIndexScanOptions) { + private VectorIndexScanComparisons(@Nonnull final ScanComparisons prefixScanComparisons, + @Nonnull final DistanceRankValueComparison distanceRankValueComparison, + @Nonnull final VectorIndexScanOptions vectorIndexScanOptions) { this.prefixScanComparisons = prefixScanComparisons; this.distanceRankValueComparison = distanceRankValueComparison; this.vectorIndexScanOptions = vectorIndexScanOptions; @@ -311,14 +311,12 @@ public static VectorIndexScanComparisons fromProto(@Nonnull final PlanSerializat } @Nonnull - public static VectorIndexScanComparisons byDistance(@Nullable ScanComparisons prefixScanComparisons, + public static VectorIndexScanComparisons byDistance(@Nullable final ScanComparisons prefixScanComparisons, @Nonnull final DistanceRankValueComparison distanceRankValueComparison, - @Nonnull VectorIndexScanOptions vectorIndexScanOptions) { - if (prefixScanComparisons == null) { - prefixScanComparisons = ScanComparisons.EMPTY; - } - - return new VectorIndexScanComparisons(prefixScanComparisons, distanceRankValueComparison, + @Nonnull final VectorIndexScanOptions vectorIndexScanOptions) { + return new VectorIndexScanComparisons( + prefixScanComparisons == null ? ScanComparisons.EMPTY : prefixScanComparisons, + distanceRankValueComparison, vectorIndexScanOptions); } diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainer.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainer.java index e5cb5bc996..d63176aab2 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainer.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainer.java @@ -346,7 +346,7 @@ protected CompletableFuture updateIndexKeys(@Nonnull f final HNSW hnsw = new HNSW(rtSubspace, getExecutor(), getConfig(), new OnWrite(timer), OnReadListener.NOOP); if (remove) { - throw new UnsupportedOperationException("not implemented"); + return hnsw.delete(state.transaction, trimmedPrimaryKey); } else { return hnsw.insert(state.transaction, trimmedPrimaryKey, RealVector.fromBytes(vectorBytes)); diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/metadata/MetaDataProtoTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/metadata/MetaDataProtoTest.java index 34c64f23e3..b6865c91f9 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/metadata/MetaDataProtoTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/metadata/MetaDataProtoTest.java @@ -58,6 +58,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import java.util.Arrays; @@ -376,7 +377,8 @@ public void indexGroupingCompatibility() throws Exception { private static class ArgumentProvider implements ArgumentsProvider { @Override - 
public Stream provideArguments(final ExtensionContext context) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of(Arguments.of("double parameter", 10.10d, 12, 12d), Arguments.of("float parameter", 11.11f, 13.13f), Arguments.of("long parameter", 42L, 44L), diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchIndexScanTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchIndexScanTest.java index becfdf0e45..0481501131 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchIndexScanTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchIndexScanTest.java @@ -66,7 +66,7 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; -import static org.junit.jupiter.params.ParameterizedTest.ARGUMENTS_WITH_NAMES_PLACEHOLDER; +import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_WITH_NAMES_PLACEHOLDER; /** * A test for the remote fetch index scan wrapper. diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchMultiColumnKeyTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchMultiColumnKeyTest.java index 32cb1dac6f..a4dc79ff6b 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchMultiColumnKeyTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchMultiColumnKeyTest.java @@ -45,7 +45,7 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.equalTo; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.params.ParameterizedTest.ARGUMENTS_WITH_NAMES_PLACEHOLDER; +import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_WITH_NAMES_PLACEHOLDER; /** * Remote fetch test with a compound primary key. diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchSplitRecordsTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchSplitRecordsTest.java index 70c8e1ef9b..6be4ea3e0a 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchSplitRecordsTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchSplitRecordsTest.java @@ -33,7 +33,7 @@ import javax.annotation.Nonnull; -import static org.junit.jupiter.params.ParameterizedTest.ARGUMENTS_WITH_NAMES_PLACEHOLDER; +import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_WITH_NAMES_PLACEHOLDER; /** * A test for the Remote Fetch with large records that are split (more than just the version split). 
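Stepping back to the VectorIndexMaintainer hunk above, where the UnsupportedOperationException on remove is replaced by a call to hnsw.delete: the following standalone sketch (not part of the patch) shows how that newly enabled code path can be driven. The class and parameter names are illustrative, the NOOP listeners stand in for the maintainer's OnWrite(timer), and delete is assumed to complete with Void just like insert; the constructor arguments mirror the unchanged call site in updateIndexKeys.

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;

import com.apple.foundationdb.Transaction;
import com.apple.foundationdb.async.hnsw.Config;
import com.apple.foundationdb.async.hnsw.HNSW;
import com.apple.foundationdb.async.hnsw.OnReadListener;
import com.apple.foundationdb.async.hnsw.OnWriteListener;
import com.apple.foundationdb.subspace.Subspace;
import com.apple.foundationdb.tuple.Tuple;

final class HnswDeleteSketch {
    private HnswDeleteSketch() {
    }

    // Sketch only: removes the entry for primaryKey from the HNSW stored under indexSubspace.
    // delete is assumed to return CompletableFuture<Void>, mirroring insert.
    static CompletableFuture<Void> removeFromIndex(final Subspace indexSubspace,
                                                   final Executor executor,
                                                   final Config config,
                                                   final Transaction transaction,
                                                   final Tuple primaryKey) {
        final HNSW hnsw = new HNSW(indexSubspace, executor, config, OnWriteListener.NOOP, OnReadListener.NOOP);
        return hnsw.delete(transaction, primaryKey);
    }
}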
diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchTest.java index 152e5c871e..8096cb282f 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/RemoteFetchTest.java @@ -63,7 +63,7 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; -import static org.junit.jupiter.params.ParameterizedTest.ARGUMENTS_WITH_NAMES_PLACEHOLDER; +import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_WITH_NAMES_PLACEHOLDER; /** * A test for the remote fetch feature. diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VersionIndexTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VersionIndexTest.java index 4d53b14828..23839bee9d 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VersionIndexTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VersionIndexTest.java @@ -146,7 +146,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; import static org.junit.jupiter.api.Assumptions.assumeTrue; -import static org.junit.jupiter.params.ParameterizedTest.ARGUMENTS_PLACEHOLDER; +import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_PLACEHOLDER; /** * Tests for {@code VERSION} type indexes. 
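The recurring test changes above and below all follow the same JUnit 5.13+ migration: ArgumentsProvider implementations now override the two-argument provideArguments overload that additionally receives the ParameterDeclarations. A minimal self-contained sketch of the migrated shape (class name and argument values are illustrative, not taken from the patch):

import java.util.stream.Stream;

import org.junit.jupiter.api.extension.ExtensionContext;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.ArgumentsProvider;
import org.junit.jupiter.params.provider.ArgumentsSource;
import org.junit.jupiter.params.support.ParameterDeclarations;

class ProvideArgumentsMigrationSketch {
    static class ExampleProvider implements ArgumentsProvider {
        @Override
        public Stream<? extends Arguments> provideArguments(final ParameterDeclarations parameterDeclarations,
                                                            final ExtensionContext context) {
            // the returned arguments are unchanged by the migration; only the overridden signature differs
            return Stream.of(
                    Arguments.of("small", 1),
                    Arguments.of("large", 1_000));
        }
    }

    @ParameterizedTest
    @ArgumentsSource(ExampleProvider.class)
    void exampleTest(final String label, final int value) {
        // assertions elided
    }
}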
diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/ArithmeticValueTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/ArithmeticValueTest.java index c06062d56b..e191f93a1d 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/ArithmeticValueTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/ArithmeticValueTest.java @@ -41,6 +41,7 @@ import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import java.util.List; import java.util.Optional; @@ -76,7 +77,8 @@ class ArithmeticValueTest { static class BinaryPredicateTestProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of(List.of(INT_1, INT_1), new ArithmeticValue.AddFn(), 2, false), Arguments.of(List.of(INT_1, INT_1), new ArithmeticValue.SubFn(), 0, false), diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/BooleanValueTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/BooleanValueTest.java index 39599c9c43..43d419bb3e 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/BooleanValueTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/BooleanValueTest.java @@ -56,6 +56,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import java.nio.charset.StandardCharsets; @@ -133,7 +134,8 @@ class BooleanValueTest { static class BinaryPredicateTestProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of(List.of(BOOL_TRUE, BOOL_TRUE), new RelOpValue.EqualsFn(), ConstantPredicate.TRUE), Arguments.of(List.of(BOOL_FALSE, BOOL_TRUE), new RelOpValue.EqualsFn(), ConstantPredicate.FALSE), @@ -885,7 +887,8 @@ public Stream provideArguments(final ExtensionContext conte static class LazyBinaryPredicateTestProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( /* lazy evaluation tests */ Arguments.of(List.of(new RelOpValue.NotEqualsFn().encapsulate(List.of(INT_1, INT_1)), diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/LikeOperatorValueTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/LikeOperatorValueTest.java index 3d87a6e26d..ddc952fc91 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/LikeOperatorValueTest.java +++ 
b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/LikeOperatorValueTest.java @@ -44,6 +44,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import java.util.Arrays; @@ -69,7 +70,8 @@ class LikeOperatorValueTest { static class InvalidInputArgumentsProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of(INT_1, INT_1, STRING_NULL), Arguments.of(LONG_1, LONG_1, STRING_NULL), @@ -94,7 +96,8 @@ public Stream provideArguments(final ExtensionContext conte static class ValidInputArgumentsProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of(null, null, null, null), Arguments.of("a", null, null, null), diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/TypeTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/TypeTest.java index 1d3ed6b1e2..db5ace10d4 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/TypeTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/TypeTest.java @@ -55,6 +55,7 @@ import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -88,7 +89,8 @@ static class ProtobufRandomMessageProvider implements ArgumentsProvider { private static final Random random = new Random(seed); @Override - public Stream provideArguments(final ExtensionContext context) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of( "TestRecords4WrapperProto.RestaurantRecord", TestRecords4WrapperProto.RestaurantRecord.newBuilder() @@ -226,7 +228,8 @@ void recordTypeIsParsable(final String paramTestTitleIgnored, final Message mess static class TypesProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) throws Exception { final var listOfNulls = new LinkedList(); listOfNulls.add(null); final var listOfNullsAndNonNulls = new LinkedList(); diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/VariadicFunctionValueTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/VariadicFunctionValueTest.java index ad0142439b..2f241d4f3b 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/VariadicFunctionValueTest.java +++ 
b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/cascades/VariadicFunctionValueTest.java @@ -42,6 +42,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import java.util.List; import java.util.Optional; @@ -138,7 +139,8 @@ private static DynamicMessage getMessageForRecordNamed() { static class BinaryPredicateTestProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( // Greatest Function Arguments.of(List.of(INT_1, INT_1), new VariadicFunctionValue.GreatestFn(), 1, false), diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/plans/ExplodePlanTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/plans/ExplodePlanTest.java index 25870edadb..b4b4676fee 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/plans/ExplodePlanTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/query/plan/plans/ExplodePlanTest.java @@ -32,6 +32,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import java.util.List; @@ -118,7 +119,8 @@ private static void verifyCursor(@Nonnull final RecordCursor actual private static class ArgumentProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of(ExplodeCursorBuilder.instance().withLimit(1), ImmutableList.of(1), true), Arguments.of(ExplodeCursorBuilder.instance().withLimit(4), ImmutableList.of(1, 2, 3, 4), true), diff --git a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/api/ddl/SqlFunctionTest.java b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/api/ddl/SqlFunctionTest.java index b4e14927f5..15fc4e46ca 100644 --- a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/api/ddl/SqlFunctionTest.java +++ b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/api/ddl/SqlFunctionTest.java @@ -51,7 +51,7 @@ import static com.apple.foundationdb.relational.matchers.SchemaTemplateMatchers.routine; import static com.apple.foundationdb.relational.utils.RelationalAssertions.assertThrows; import static org.hamcrest.MatcherAssert.assertThat; -import static org.junit.jupiter.params.ParameterizedTest.ARGUMENTS_PLACEHOLDER; +import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_PLACEHOLDER; /** * Contains a number of tests for creating SQL functions. 
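The static-import change in SqlFunctionTest above, like those in the RemoteFetch and VersionIndex tests earlier, follows the same pattern: the display-name placeholders moved from ParameterizedTest to ParameterizedInvocationConstants. A short usage sketch of the relocated constant (test class and value source are illustrative):

import static org.junit.jupiter.params.ParameterizedInvocationConstants.ARGUMENTS_WITH_NAMES_PLACEHOLDER;

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

class PlaceholderMigrationSketch {
    // The placeholder's value and behavior are unchanged; only its defining class moved.
    @ParameterizedTest(name = "case [" + ARGUMENTS_WITH_NAMES_PLACEHOLDER + "]")
    @ValueSource(booleans = {false, true})
    void exampleTest(final boolean flag) {
        // assertions elided
    }
}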
diff --git a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestDescriptor.java b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestDescriptor.java index 027c6c1832..023f3b264e 100644 --- a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestDescriptor.java +++ b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestDescriptor.java @@ -85,11 +85,6 @@ class AutoTestDescriptor extends ClassTestDescriptor { this.configInvoker = configInvoker; } - @Override - public Type getType() { - return Type.CONTAINER; - } - @Override public boolean mayRegisterTests() { return true; @@ -114,7 +109,7 @@ private void invokeBeforeEachCallbacks(JupiterEngineExecutionContext context) t MutableExtensionRegistry registry = context.getExtensionRegistry(); ExtensionContext extensionContext = context.getExtensionContext(); ThrowableCollector throwableCollector = context.getThrowableCollector(); - final TestInstances testInstances = context.getTestInstancesProvider().getTestInstances(registry, throwableCollector); + final TestInstances testInstances = context.getTestInstancesProvider().getTestInstances(registry, context); JunitUtils.setTestInstances(extensionContext, testInstances); for (BeforeEachCallback callback : registry.getExtensions(BeforeEachCallback.class)) { @@ -145,7 +140,7 @@ public void cleanUp(JupiterEngineExecutionContext context) throws Exception { private void invokeWorkloadTests(JupiterEngineExecutionContext context, DynamicTestExecutor dynamicTestExecutor, JupiterConfiguration configuration) throws InterruptedException { Object instance = context.getTestInstancesProvider() - .getTestInstances(context.getExtensionRegistry(), context.getThrowableCollector()) + .getTestInstances(context.getExtensionRegistry(), context) .findInstance(getTestClass()) .orElseThrow(); @@ -173,7 +168,7 @@ private void invokeWorkloadTests(JupiterEngineExecutionContext context, DynamicT Collection queries = queryInvoker.getQueries(instance, workload.getSchema(), context, executableInvoker); queries.forEach(querySet -> { UniqueId uid = parentId.append(DYNAMIC_CONTAINER_SEGMENT_TYPE, workload.getDisplayName() + "-" + querySet.getLabel()); - TestDescriptor descriptor = new WorkloadTestDescriptor(uid, getTestClass(), configuration, workload, querySet); + TestDescriptor descriptor = new WorkloadTestDescriptor(uid, getTestClass(), this, configuration, workload, querySet); dynamicTestExecutor.execute(descriptor); }); })); diff --git a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestEngine.java b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestEngine.java index 322f3872ed..fae1333e80 100644 --- a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestEngine.java +++ b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/AutoTestEngine.java @@ -21,12 +21,12 @@ package com.apple.foundationdb.relational.autotest.engine; import com.apple.foundationdb.relational.autotest.AutomatedTest; - import org.junit.jupiter.engine.config.CachingJupiterConfiguration; import org.junit.jupiter.engine.config.DefaultJupiterConfiguration; import org.junit.jupiter.engine.config.JupiterConfiguration; import org.junit.jupiter.engine.descriptor.JupiterEngineDescriptor; import org.junit.jupiter.engine.execution.JupiterEngineExecutionContext; 
+import org.junit.jupiter.engine.execution.LauncherStoreFacade; import org.junit.platform.commons.support.AnnotationSupport; import org.junit.platform.commons.support.ReflectionSupport; import org.junit.platform.engine.EngineDiscoveryRequest; @@ -54,7 +54,8 @@ public String getId() { @Override public TestDescriptor discover(EngineDiscoveryRequest discoveryRequest, UniqueId uniqueId) { JupiterConfiguration config = new CachingJupiterConfiguration( - new DefaultJupiterConfiguration(discoveryRequest.getConfigurationParameters())); + new DefaultJupiterConfiguration(discoveryRequest.getConfigurationParameters(), + discoveryRequest.getOutputDirectoryCreator())); TestDescriptor rootDescriptor = new JupiterEngineDescriptor(uniqueId, config); discoveryRequest.getSelectorsByType(ClasspathRootSelector.class).forEach(selector -> @@ -72,7 +73,8 @@ public TestDescriptor discover(EngineDiscoveryRequest discoveryRequest, UniqueId protected JupiterEngineExecutionContext createExecutionContext(ExecutionRequest request) { JupiterEngineDescriptor engineDescriptor = (JupiterEngineDescriptor) request.getRootTestDescriptor(); JupiterConfiguration config = engineDescriptor.getConfiguration(); - return new JupiterEngineExecutionContext(request.getEngineExecutionListener(), config); + return new JupiterEngineExecutionContext(request.getEngineExecutionListener(), config, + new LauncherStoreFacade(request.getStore())); } private void appendTestsInClass(Class javaClass, TestDescriptor engineDesc, JupiterConfiguration config) { @@ -109,5 +111,4 @@ public void execute(ExecutionRequest request) { new AutomatedTestExecutor().execute(request,root); } */ - } diff --git a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/WorkloadTestDescriptor.java b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/WorkloadTestDescriptor.java index ce38cce0a5..8a32958b55 100644 --- a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/WorkloadTestDescriptor.java +++ b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/autotest/engine/WorkloadTestDescriptor.java @@ -20,12 +20,12 @@ package com.apple.foundationdb.relational.autotest.engine; -import com.apple.foundationdb.relational.api.Row; -import com.apple.foundationdb.relational.api.StructMetaData; import com.apple.foundationdb.relational.api.RelationalConnection; import com.apple.foundationdb.relational.api.RelationalResultSet; import com.apple.foundationdb.relational.api.RelationalStatement; import com.apple.foundationdb.relational.api.RelationalStruct; +import com.apple.foundationdb.relational.api.Row; +import com.apple.foundationdb.relational.api.StructMetaData; import com.apple.foundationdb.relational.api.exceptions.ErrorCode; import com.apple.foundationdb.relational.api.exceptions.RelationalException; import com.apple.foundationdb.relational.autotest.Connector; @@ -40,7 +40,6 @@ import com.apple.foundationdb.relational.recordlayer.util.ExceptionUtil; import com.apple.foundationdb.relational.utils.ReservoirSample; import com.apple.foundationdb.relational.utils.ResultSetAssert; - import org.junit.jupiter.api.DynamicTest; import org.junit.jupiter.api.extension.ExtensionContext; import org.junit.jupiter.api.extension.TestExecutionExceptionHandler; @@ -82,19 +81,15 @@ class WorkloadTestDescriptor extends NestedClassTestDescriptor { public WorkloadTestDescriptor(UniqueId uniqueId, Class testClass, + TestDescriptor parent, JupiterConfiguration configuration, AutoWorkload 
workload, QuerySet queries) { - super(uniqueId, testClass, configuration); + super(uniqueId, testClass, () -> NestedClassTestDescriptor.getEnclosingTestClasses(parent), configuration); this.workload = workload; this.querySet = queries; } - @Override - public Type getType() { - return Type.CONTAINER; - } - @Override public boolean mayRegisterTests() { return true; diff --git a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/PlanGenerationStackTest.java b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/PlanGenerationStackTest.java index a1de830062..341043d570 100644 --- a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/PlanGenerationStackTest.java +++ b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/PlanGenerationStackTest.java @@ -37,6 +37,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -80,7 +81,8 @@ public PlanGenerationStackTest() { static class RandomQueryProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of(0, "select count(*) from restaurant", null), Arguments.of(1, "select * from restaurant", null), diff --git a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/query/QueryTypeTests.java b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/query/QueryTypeTests.java index cb6308bd93..c410c5c714 100644 --- a/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/query/QueryTypeTests.java +++ b/fdb-relational-core/src/test/java/com/apple/foundationdb/relational/recordlayer/query/QueryTypeTests.java @@ -29,6 +29,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.provider.ArgumentsSource; +import org.junit.jupiter.params.support.ParameterDeclarations; import javax.annotation.Nonnull; import java.util.stream.Stream; @@ -40,7 +41,8 @@ public class QueryTypeTests { static class QueriesProvider implements ArgumentsProvider { @Override - public Stream provideArguments(final ExtensionContext context) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext context) { return Stream.of( Arguments.of("select count(*) from restaurant", ParseTreeInfo.QueryType.SELECT), Arguments.of(" select * from restaurant", ParseTreeInfo.QueryType.SELECT), diff --git a/fdb-test-utils/src/main/java/com/apple/test/BooleanArgumentsProvider.java b/fdb-test-utils/src/main/java/com/apple/test/BooleanArgumentsProvider.java index 33e5c853e3..a5524e8348 100644 --- a/fdb-test-utils/src/main/java/com/apple/test/BooleanArgumentsProvider.java +++ b/fdb-test-utils/src/main/java/com/apple/test/BooleanArgumentsProvider.java @@ -24,6 +24,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.support.AnnotationConsumer; +import org.junit.jupiter.params.support.ParameterDeclarations; 
import java.util.Arrays; import java.util.stream.Stream; @@ -42,7 +43,8 @@ public void accept(BooleanSource booleanSource) { } @Override - public Stream provideArguments(ExtensionContext extensionContext) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext extensionContext) throws Exception { if (names.length == 0) { throw new IllegalStateException("@BooleanSource has an empty list of names"); } diff --git a/fdb-test-utils/src/main/java/com/apple/test/RandomSeedProvider.java b/fdb-test-utils/src/main/java/com/apple/test/RandomSeedProvider.java index e719ad059b..50bafcb790 100644 --- a/fdb-test-utils/src/main/java/com/apple/test/RandomSeedProvider.java +++ b/fdb-test-utils/src/main/java/com/apple/test/RandomSeedProvider.java @@ -24,6 +24,7 @@ import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.ArgumentsProvider; import org.junit.jupiter.params.support.AnnotationConsumer; +import org.junit.jupiter.params.support.ParameterDeclarations; import java.util.stream.Stream; @@ -40,7 +41,8 @@ public void accept(final RandomSeedSource annotation) { } @Override - public Stream provideArguments(final ExtensionContext extensionContext) throws Exception { + public Stream provideArguments(final ParameterDeclarations parameterDeclarations, + final ExtensionContext extensionContext) throws Exception { return RandomizedTestUtils.randomSeeds(fixedSeeds).map(Arguments::of); } } diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c4e6482b97..f9671f2b0d 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -62,8 +62,8 @@ diffutils = "4.12" hamcrest = "2.2" jcommander = "1.81" jline = "3.30.4" -junit = "5.11.3" -junit-platform = "1.7.1" +junit = "5.14.1" +junit-platform = "1.14.1" mockito = "3.7.7" snakeyaml = "2.2" @@ -145,7 +145,7 @@ spotbugs-annotations = { module = "com.github.spotbugs:spotbugs-annotations", ve [bundles] test-impl = [ "assertj", "hamcrest", "junit-api", "junit-params", "log4j-core", "mockito", "bndtools" ] -test-runtime = [ "junit-engine", "log4j-slf4jBinding"] +test-runtime = [ "junit-engine", "junit-platform", "log4j-slf4jBinding"] test-compileOnly = [ "autoService", "jsr305" ] [plugins] From bde3495045cbb8b8eb226a48032dcce20e45a975 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Fri, 12 Dec 2025 18:19:55 +0100 Subject: [PATCH 08/17] hnsw delete testcases work --- .../async/hnsw/DeleteNeighborsChangeSet.java | 4 +- .../apple/foundationdb/async/hnsw/HNSW.java | 541 +++++++++++------- .../async/hnsw/InsertNeighborsChangeSet.java | 5 +- .../foundationdb/async/hnsw/HNSWTest.java | 69 ++- 4 files changed, 375 insertions(+), 244 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java index 1d6b5ff4a6..194db87eab 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java @@ -132,8 +132,8 @@ public void writeDelta(@Nonnull final InliningStorageAdapter storageAdapter, @No if (tuplePredicate.test(deletedNeighborPrimaryKey)) { storageAdapter.deleteNeighbor(transaction, layer, node.asInliningNode(), deletedNeighborPrimaryKey); if (logger.isTraceEnabled()) { - logger.trace("deleted neighbor of primaryKey={} targeting primaryKey={}", 
node.getPrimaryKey(), - deletedNeighborPrimaryKey); + logger.trace("deleted neighbor of layer={}, primaryKey={} targeting primaryKey={}", + layer, node.getPrimaryKey(), deletedNeighborPrimaryKey); } } } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 08292cb18e..955b8f8901 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -269,8 +269,8 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { entryNodeReference.getVector(), estimator.distance(transformedQueryVector, entryNodeReference.getVector())); - final int entryLayer = entryNodeReference.getLayer(); - return forLoop(entryLayer, entryState, + final int topLayer = entryNodeReference.getLayer(); + return forLoop(topLayer, entryState, layer -> layer > 0, layer -> layer - 1, (layer, previousNodeReference) -> { @@ -379,64 +379,85 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, final int layer, @Nonnull final Transformed queryVector) { -// if (storageAdapter.isInliningStorageAdapter()) { -// return greedySearchInliningLayer(storageAdapter.asInliningStorageAdapter(), readTransaction, -// storageTransform, estimator, nodeReferenceWithDistance, layer, queryVector); -// } else { - return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, - ImmutableList.of(nodeReferenceWithDistance), layer, 1, Maps.newConcurrentMap(), queryVector) - .thenApply(searchResult -> - Iterables.getOnlyElement(searchResult).getNodeReference()); -// } + if (storageAdapter.isInliningStorageAdapter()) { + return greedySearchInliningLayer(storageAdapter.asInliningStorageAdapter(), readTransaction, + storageTransform, estimator, nodeReferenceWithDistance, layer, queryVector); + } else { + return searchLayer(storageAdapter, readTransaction, storageTransform, estimator, + ImmutableList.of(nodeReferenceWithDistance), layer, 1, Maps.newConcurrentMap(), queryVector) + .thenApply(searchResult -> + Iterables.getOnlyElement(searchResult).getNodeReference()); + } } -// @Nonnull -// private CompletableFuture greedySearchInliningLayer(@Nonnull final InliningStorageAdapter storageAdapter, -// @Nonnull final ReadTransaction readTransaction, -// @Nonnull final StorageTransform storageTransform, -// @Nonnull final Estimator estimator, -// @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, -// final int layer, -// @Nonnull final Transformed queryVector) { -// final AtomicReference currentNodeReferenceAtomic = -// new AtomicReference<>(nodeReferenceWithDistance); -// -// return AsyncUtil.whileTrue(() -> onReadListener.onAsyncRead( -// storageAdapter.fetchNode(readTransaction, storageTransform, layer, currentNodeReferenceAtomic.get().getPrimaryKey())) -// .thenApply(node -> { -// if (node == null) { -// // -// // This cannot happen under normal circumstances as the storage adapter returns a node with no -// // neighbors if it already has been deleted. Therefore, it is correct to throw here. 
-// // -// throw new IllegalStateException("unable to fetch node"); -// } -// final InliningNode inliningNode = node.asInliningNode(); -// final List neighbors = inliningNode.getNeighbors(); -// -// final NodeReferenceWithDistance currentNodeReference = currentNodeReferenceAtomic.get(); -// double minDistance = currentNodeReference.getDistance(); -// -// NodeReferenceWithVector nearestNeighbor = null; -// for (final NodeReferenceWithVector neighbor : neighbors) { -// final double distance = -// estimator.distance(neighbor.getVector(), queryVector); -// if (distance < minDistance) { -// minDistance = distance; -// nearestNeighbor = neighbor; -// } -// } -// -// if (nearestNeighbor == null) { -// return false; -// } -// -// currentNodeReferenceAtomic.set( -// new NodeReferenceWithDistance(nearestNeighbor.getPrimaryKey(), nearestNeighbor.getVector(), -// minDistance)); -// return true; -// }), executor).thenApply(ignored -> currentNodeReferenceAtomic.get()); -// } + @Nonnull + private CompletableFuture greedySearchInliningLayer(@Nonnull final InliningStorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final StorageTransform storageTransform, + @Nonnull final Estimator estimator, + @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, + final int layer, + @Nonnull final Transformed queryVector) { + final AtomicReference nearestNodeReferenceAtomic = + new AtomicReference<>(null); + + final Queue candidates = + // This initial capacity is somewhat arbitrary as m is not necessarily a limit, + // but it gives us a number that is better than the default. + new PriorityQueue<>(config.getM(), + Comparator.comparing(NodeReferenceWithDistance::getDistance)); + candidates.add(nodeReferenceWithDistance); + + return AsyncUtil.whileTrue(() -> onReadListener.onAsyncRead( + storageAdapter.fetchNode(readTransaction, storageTransform, layer, + Objects.requireNonNull(candidates.peek()).getPrimaryKey())) + .thenCompose(node -> { + if (node == null) { + // + // This cannot happen under normal circumstances as the storage adapter returns a node with no + // neighbors if it already has been deleted. Therefore, it is correct to throw here. + // + throw new IllegalStateException("unable to fetch node"); + } + final InliningNode candidateNode = node.asInliningNode(); + final List neighbors = candidateNode.getNeighbors(); + + if (neighbors.isEmpty()) { + // If there are no neighbors, we either really have no neighbor on this level anymore and the + // node does exist (on layer 0), or not. + return exists(readTransaction, node.getPrimaryKey()) + .thenApply(nodeExists -> nodeExists ? candidateNode : null); + } else { + return CompletableFuture.completedFuture(candidateNode); + } + }) + .thenApply(candidateNode -> { + final NodeReferenceWithDistance candidateReference = Objects.requireNonNull(candidates.poll()); + if (candidateNode != null) { + // + // This node definitely does exist. And it's the nearest one. + // + nearestNodeReferenceAtomic.set(candidateReference); + candidates.clear(); + + // + // Find some new candidates. 
+ // + double minDistance = candidateReference.getDistance(); + + for (final NodeReferenceWithVector neighbor : candidateNode.getNeighbors()) { + final double distance = + estimator.distance(neighbor.getVector(), queryVector); + if (distance < minDistance) { + candidates.add( + new NodeReferenceWithDistance(neighbor.getPrimaryKey(), neighbor.getVector(), + distance)); + } + } + } + return !candidates.isEmpty(); + }), executor).thenApply(ignored -> nearestNodeReferenceAtomic.get()); + } /** * Searches a single layer of the graph to find the nearest neighbors to a query vector. @@ -932,6 +953,38 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N }).thenCompose(ignored -> AsyncUtil.DONE); } + @Nonnull + private CompletableFuture>> + filterExisting(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final ReadTransaction readTransaction, + @Nonnull final Iterable> nodeReferenceAndNodes) { + if (!storageAdapter.isInliningStorageAdapter()) { + return CompletableFuture.completedFuture(ImmutableList.copyOf(nodeReferenceAndNodes)); + } + + return forEach(nodeReferenceAndNodes, + nodeReferenceAndNode -> { + if (nodeReferenceAndNode.getNode().getNeighbors().isEmpty()) { + return exists(readTransaction, nodeReferenceAndNode.getNodeReference().getPrimaryKey()) + .thenApply(nodeExists -> nodeExists ? nodeReferenceAndNode : null); + } else { + // this node has neighbors -- it must exist + return CompletableFuture.completedFuture(nodeReferenceAndNode); + } + }, + getConfig().getMaxNumConcurrentNodeFetches(), + getExecutor()) + .thenApply(results -> { + final ImmutableList.Builder> filteredListBuilder = ImmutableList.builder(); + for (final NodeReferenceAndNode result : results) { + if (result != null) { + filteredListBuilder.add(result); + } + } + return filteredListBuilder.build(); + }); + } + @Nonnull @VisibleForTesting CompletableFuture exists(@Nonnull final ReadTransaction readTransaction, @@ -1482,7 +1535,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) if (isExtendCandidates) { return neighborReferences(storageAdapter, readTransaction, storageTransform, null, candidates, - CandidateSamplingPredicate.tautology(), layer, nodeCache) + CandidatePredicate.tautology(), layer, nodeCache) .thenApply(neighborsOfCandidates -> { for (final NodeReferenceWithVector nodeReferenceWithVector : neighborsOfCandidates) { final double distance = estimator.distance(nodeReferenceWithVector.getVector(), vector); @@ -1526,14 +1579,16 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) @Nonnull final AffineOperator storageTransform, @Nonnull final SplittableRandom random, @Nonnull final Collection> initialNodeReferenceAndNodes, - @Nonnull final CandidateSamplingPredicate samplingPredicate, + @Nonnull final CandidatePredicate samplingPredicate, final int layer, @Nonnull final Map> nodeCache) { return neighborReferences(storageAdapter, readTransaction, storageTransform, random, initialNodeReferenceAndNodes, samplingPredicate, layer, nodeCache) .thenCompose(neighbors -> fetchSomeNodesIfNotCached(storageAdapter, readTransaction, storageTransform, layer, - neighbors, nodeCache)); + neighbors, nodeCache)) + .thenCompose(neighbors -> + filterExisting(storageAdapter, readTransaction, neighbors)); } /** @@ -1561,7 +1616,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) @Nonnull final AffineOperator storageTransform, @Nullable final SplittableRandom random, @Nonnull final Collection> initialNodeReferenceAndNodes, - @Nonnull final CandidateSamplingPredicate 
samplingPredicate, + @Nonnull final CandidatePredicate samplingPredicate, final int layer, @Nonnull final Map> nodeCache) { final Iterable toBeFetched = @@ -1585,12 +1640,12 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private Set resolveNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, @Nullable final SplittableRandom random, - @Nonnull final CandidateSamplingPredicate samplingPredicate) { - final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); + @Nonnull final CandidatePredicate candidatePredicate) { + final Set neighborReferences = Sets.newHashSet(); final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { initialNodesMapBuilder.put(nodeReferenceAndNode.getNode().getPrimaryKey(), nodeReferenceAndNode); - resultBuilder.add(nodeReferenceAndNode.getNodeReference()); + neighborReferences.add(nodeReferenceAndNode.getNodeReference()); } final ImmutableMap> initialNodesMap = initialNodesMapBuilder.build(); @@ -1600,11 +1655,6 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) for (final N neighbor : nodeReferenceAndNode.getNode().getNeighbors()) { final Tuple neighborPrimaryKey = neighbor.getPrimaryKey(); - if (!samplingPredicate.test(random, - initialNodeReferenceAndNodes.size(), neighbor)) { - continue; - } - // // We need to distinguish between initial node references and non-initial node references: // Initial nodes references are of type T (and sometimes already contain a vector in which case @@ -1619,11 +1669,21 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) // This is a node that is currently not known to us. It is not an initial node. We need to fetch it, // and we need to mark it as seen so we won't consider it more than once. 
// - resultBuilder.add(neighbor); + neighborReferences.add(neighbor); nodeReferencesSeen.add(neighborPrimaryKey); } } } + + // sample down the set of neighbors + final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); + for (final NodeReference neighborReference : neighborReferences) { + if (candidatePredicate.test(random, initialNodesMap.keySet(), + neighborReferences.size(), neighborReference)) { + resultBuilder.add(neighborReference); + } + } + return resultBuilder.build(); } @@ -1704,19 +1764,19 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N final SplittableRandom random = random(primaryKey); final int topLayer = topLayer(primaryKey); if (logger.isTraceEnabled()) { - logger.trace("new node with key={} to be deleted form layer={}", primaryKey, topLayer); + logger.trace("node with key={} to be deleted form layer={}", primaryKey, topLayer); } return StorageAdapter.fetchAccessInfo(getConfig(), transaction, getSubspace(), getOnReadListener()) .thenCombine(exists(transaction, primaryKey), - (accessInfo, nodeAlreadyExists) -> { - if (!nodeAlreadyExists) { - if (logger.isDebugEnabled()) { - logger.debug("record does not exists in HNSW with key={} on layer={}", + (accessInfo, nodeExists) -> { + if (!nodeExists) { + if (logger.isTraceEnabled()) { + logger.trace("record does not exists in HNSW with key={} on layer={}", primaryKey, topLayer); } } - return new AccessInfoAndNodeExistence(accessInfo, nodeAlreadyExists); + return new AccessInfoAndNodeExistence(accessInfo, nodeExists); }) .thenCompose(accessInfoAndNodeExistence -> { if (!accessInfoAndNodeExistence.isNodeExists()) { @@ -1773,10 +1833,6 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, final int topLayer) { - if (logger.isTraceEnabled()) { - logger.trace("nearest entry point for deleteFromLayers at topLayer={} is at key={}", topLayer, primaryKey); - } - return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), layer -> { final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); @@ -1826,46 +1882,32 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = new NodeReferenceAndNode<>(new NodeReference(toBeDeletedPrimaryKey), toBeDeletedNode); - return candidatesForRepairs(storageAdapter, transaction, storageTransform, random, layer, + return findCandidates(storageAdapter, transaction, storageTransform, random, layer, toBeDeletedNodeReferenceAndNode, nodeCache) - .thenApply(candidates -> { - for (final NodeReferenceAndNode candidate : candidates) { - final AbstractNode neighbors = candidate.getNode(); - for (final N neighborOfCandidate : neighbors.getNeighbors()) { - if (neighborOfCandidate.getPrimaryKey().equals(toBeDeletedPrimaryKey)) { - // - // Make sure the neighbor pointing to the node-to-be-deleted is deleted as - // well. 
- // - candidateChangeSetMap.put(neighbors.getPrimaryKey(), - new DeleteNeighborsChangeSet<>( - new BaseNeighborsChangeSet<>(neighbors.getNeighbors()), - ImmutableList.of(toBeDeletedPrimaryKey))); - break; - } - } - } - return candidates; - }) - .thenCompose(candidates -> - forEach(toBeDeletedNode.getNeighbors(), // for each direct neighbor - neighborReference -> - prepareCandidatesAndRepairNeighbor(storageAdapter, transaction, - storageTransform, quantizer, layer, toBeDeletedPrimaryKey, - neighborReference, candidates, candidateChangeSetMap, - nodeCache), - getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor) - .thenApply(ignored -> { - final ImmutableMap.Builder candidateReferencesMapBuilder = - ImmutableMap.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - final var candidatePrimaryKey = candidate.getNodeReference().getPrimaryKey(); - if (candidateChangeSetMap.containsKey(candidatePrimaryKey)) { - candidateReferencesMapBuilder.put(candidatePrimaryKey, candidate.getNodeReference()); - } + .thenCompose(candidates -> { + initializeCandidateChangeSetMap(toBeDeletedPrimaryKey, toBeDeletedNode, candidates, + candidateChangeSetMap); + final ImmutableList primaryNeighbors = + primaryNeighbors(toBeDeletedNode, candidateChangeSetMap); + + return forEach(primaryNeighbors, + neighborReference -> + repairNeighbor(storageAdapter, transaction, + storageTransform, quantizer, layer, neighborReference, + candidates, candidateChangeSetMap, nodeCache), + getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor) + .thenApply(ignored -> { + final ImmutableMap.Builder candidateReferencesMapBuilder = + ImmutableMap.builder(); + for (final NodeReferenceAndNode candidate : candidates) { + final var candidatePrimaryKey = candidate.getNodeReference().getPrimaryKey(); + if (candidateChangeSetMap.containsKey(candidatePrimaryKey)) { + candidateReferencesMapBuilder.put(candidatePrimaryKey, candidate.getNodeReference()); } - return candidateReferencesMapBuilder.build(); - })) + } + return candidateReferencesMapBuilder.build(); + }); + }) .thenCompose(candidateReferencesMap -> { final int currentMMax = layer == 0 ? getConfig().getMMax0() : getConfig().getMMax(); @@ -1906,8 +1948,7 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi // // Return the first item in the candidates reference map as a potential new // entry node reference in order to avoid a costly search for a new global entry point. - // This reference may not exist in a sparse HNSW but that case should be exceedingly - // rare. + // This reference is guaranteed to exist. // final Tuple firstPrimaryKey = Iterables.getFirst(candidateReferencesMap.keySet(), null); @@ -1926,83 +1967,120 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi } @Nonnull - private CompletableFuture>> - candidatesForRepairs(final @Nonnull StorageAdapter storageAdapter, - final @Nonnull Transaction transaction, - final @Nonnull AffineOperator storageTransform, - final @Nonnull SplittableRandom random, - final int layer, - final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode, - final Map> nodeCache) { + private ImmutableList + primaryNeighbors(@Nonnull final AbstractNode toBeDeletedNode, + @Nonnull final Map> candidateChangeSetMap) { + // + // All candidates are definitely existing and the candidates hold all existing primary + // candidates. 
+ // + final ImmutableList.Builder primaryNeighborsBuilder = ImmutableList.builder(); + for (N potentialPrimaryNeighbor : toBeDeletedNode.getNeighbors()) { + if (candidateChangeSetMap.containsKey(potentialPrimaryNeighbor.getPrimaryKey())) { + primaryNeighborsBuilder.add(potentialPrimaryNeighbor); + } + } + return primaryNeighborsBuilder.build(); + } + + private + void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, + @Nonnull final AbstractNode toBeDeletedNode, + @Nonnull final List> candidates, + @Nonnull final Map> candidateChangeSetMap) { + for (final NodeReferenceAndNode candidate : candidates) { + final AbstractNode candidateNode = candidate.getNode(); + boolean foundToBeDeleted = false; + for (final N neighborOfCandidate : candidateNode.getNeighbors()) { + if (neighborOfCandidate.getPrimaryKey().equals(toBeDeletedPrimaryKey)) { + // + // Make sure the neighbor pointing to the node-to-be-deleted is deleted as + // well. + // + candidateChangeSetMap.put(candidateNode.getPrimaryKey(), + new DeleteNeighborsChangeSet<>( + new BaseNeighborsChangeSet<>(candidateNode.getNeighbors()), + ImmutableList.of(toBeDeletedPrimaryKey))); + foundToBeDeleted = true; + break; + } + } + if (!foundToBeDeleted) { + candidateChangeSetMap.put(candidateNode.getPrimaryKey(), + new BaseNeighborsChangeSet<>(candidateNode.getNeighbors())); + } + } + if (logger.isTraceEnabled()) { + logger.trace("number of neighbors to repair={}", toBeDeletedNode.getNeighbors().size()); + } + } + + @Nonnull + private CompletableFuture>> + findCandidates(final @Nonnull StorageAdapter storageAdapter, + final @Nonnull Transaction transaction, + final @Nonnull AffineOperator storageTransform, + final @Nonnull SplittableRandom random, + final int layer, + final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode, + final Map> nodeCache) { return neighbors(storageAdapter, transaction, storageTransform, random, ImmutableList.of(toBeDeletedNodeReferenceAndNode), - CandidateSamplingPredicate.tautology(), layer, nodeCache) + ((r, initialNodeKeys, size, nodeReference) -> + usePrimaryCandidateForRepair(nodeReference, + toBeDeletedNodeReferenceAndNode.getNodeReference().getPrimaryKey())), layer, nodeCache) .thenCompose(candidates -> neighbors(storageAdapter, transaction, storageTransform, random, candidates, - this::shouldSampleCandidate, layer, nodeCache)) - .thenApply(candidates -> { - final ImmutableList.Builder> filteredCandidatesBuilder = - ImmutableList.builder(); - for (final NodeReferenceAndNode candidate : candidates) { - // filter out neighbors that happen to be the node we are trying to delete - if (!candidate.getNodeReference().getPrimaryKey() - .equals(toBeDeletedNodeReferenceAndNode.getNode().getPrimaryKey())) { - filteredCandidatesBuilder.add(candidate); - } - } - return filteredCandidatesBuilder.build(); - }) + ((r, initialNodeKeys, size, nodeReference) -> + useSecondaryCandidateForRepair(r, initialNodeKeys, size, nodeReference, + toBeDeletedNodeReferenceAndNode.getNodeReference().getPrimaryKey())), + layer, nodeCache)) .thenApply(candidates -> { if (logger.isTraceEnabled()) { final ImmutableList.Builder candidateStringsBuilder = ImmutableList.builder(); for (final NodeReferenceAndNode candidate : candidates) { candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); } - logger.trace("resolved candidates={}", String.join(",", - candidateStringsBuilder.build())); + logger.trace("resolved at layer={} num={} candidates={}", layer, candidates.size(), + String.join(",", 
candidateStringsBuilder.build())); } return candidates; }); } private @Nonnull CompletableFuture - prepareCandidatesAndRepairNeighbor(@Nonnull final StorageAdapter storageAdapter, - @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, - @Nonnull final Quantizer quantizer, - final int layer, - @Nonnull final Tuple toBeDeletedPrimaryKey, - @Nonnull final N neighborReference, - @Nonnull final Collection> sampledCandidates, - @Nonnull final Map> neighborChangeSetMap, - @Nonnull final Map> nodeCache) { + repairNeighbor(@Nonnull final StorageAdapter storageAdapter, + @Nonnull final Transaction transaction, + @Nonnull final AffineOperator storageTransform, + @Nonnull final Quantizer quantizer, + final int layer, + @Nonnull final N neighborReference, + @Nonnull final Collection> sampledCandidates, + @Nonnull final Map> neighborChangeSetMap, + @Nonnull final Map> nodeCache) { final Estimator estimator = quantizer.estimator(); return fetchNodeIfNotCached(storageAdapter, transaction, storageTransform, layer, neighborReference, nodeCache) .thenCompose(neighborNode -> { - if (neighborNode == null) { - // node could not be fetched; maybe it was deleted already -> ignore - return AsyncUtil.DONE; - } final ImmutableList.Builder candidatesReferencesBuilder = ImmutableList.builder(); final Transformed neighborVector = storageAdapter.getVector(neighborReference, neighborNode); + // transform the NodeReferencesWithVectors into NodeReferencesWithDistance for (final NodeReferenceAndNode candidate : sampledCandidates) { // do not add the candidate if that candidate is in fact the neighbor itself if (!candidate.getNodeReference().getPrimaryKey().equals(neighborReference.getPrimaryKey())) { final Transformed candidateVector = candidate.getNodeReference().getVector(); final double distance = - estimator.distance(candidateVector, - neighborVector); - candidatesReferencesBuilder.add(new NodeReferenceWithDistance(candidate.getNode().getPrimaryKey(), - candidateVector, distance)); + estimator.distance(candidateVector, neighborVector); + candidatesReferencesBuilder.add(new NodeReferenceWithDistance( + candidate.getNode().getPrimaryKey(), candidateVector, distance)); } } return repairInsForNeighborNode(storageAdapter, transaction, storageTransform, estimator, - layer, toBeDeletedPrimaryKey, neighborReference, candidatesReferencesBuilder.build(), + layer, neighborReference, candidatesReferencesBuilder.build(), neighborChangeSetMap, nodeCache); }); } @@ -2013,7 +2091,6 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final AffineOperator storageTransform, @Nonnull final Estimator estimator, final int layer, - @Nonnull final Tuple toBeDeletedPrimaryKey, @Nonnull final N neighborReference, @Nonnull final Iterable candidates, @Nonnull final Map> neighborChangeSetMap, @@ -2037,62 +2114,15 @@ layer, getConfig().getM(), nodeCache) for (final NodeReferenceAndNode selectedCandidate : selectedCandidates) { neighborChangeSetMap.compute(selectedCandidate.getNode().getPrimaryKey(), (ignored, oldChangeSet) -> { - final NeighborsChangeSet baseSet; - if (oldChangeSet == null) { - baseSet = - // delete the primary key of the record we are trying to delete - new DeleteNeighborsChangeSet<>( - new BaseNeighborsChangeSet<>(selectedCandidate.getNode().getNeighbors()), - ImmutableList.of(toBeDeletedPrimaryKey)); - } else { - baseSet = oldChangeSet; - } + Objects.requireNonNull(oldChangeSet); // insert a reference to the neighbor - return new InsertNeighborsChangeSet<>(baseSet, 
ImmutableList.of(neighborReference)); + return new InsertNeighborsChangeSet<>(oldChangeSet, ImmutableList.of(neighborReference)); }); } return AsyncUtil.DONE; }); } - /** - * Scans all nodes within a given layer of the database. - *

- * The scan is performed transactionally in batches to avoid loading the entire layer into memory at once. Each - * discovered node is passed to the provided {@link Consumer} for processing. The operation continues fetching - * batches until all nodes in the specified layer have been processed. - * - * @param db the non-null {@link Database} instance to run the scan against. - * @param layer the specific layer index to scan. - * @param batchSize the number of nodes to retrieve and process in each batch. - * @param nodeConsumer the non-null {@link Consumer} that will accept each {@link AbstractNode} - * found in the layer. - */ - @VisibleForTesting - static void scanLayer(@Nonnull final Config config, - @Nonnull final Subspace subspace, - @Nonnull final Database db, - final int layer, - final int batchSize, - @Nonnull final Consumer> nodeConsumer) { - final StorageAdapter storageAdapter = - storageAdapterForLayer(config, subspace, OnWriteListener.NOOP, OnReadListener.NOOP, layer); - final AtomicReference lastPrimaryKeyAtomic = new AtomicReference<>(); - Tuple newPrimaryKey; - do { - final Tuple lastPrimaryKey = lastPrimaryKeyAtomic.get(); - lastPrimaryKeyAtomic.set(null); - newPrimaryKey = db.run(tr -> { - Streams.stream(storageAdapter.scanLayer(tr, layer, lastPrimaryKey, batchSize)) - .forEach(node -> { - nodeConsumer.accept(Objects.requireNonNull(node)); - lastPrimaryKeyAtomic.set(node.getPrimaryKey()); - }); - return lastPrimaryKeyAtomic.get(); - }); - } while (newPrimaryKey != null); - } - /** * Gets the appropriate storage adapter for a given layer. *

@@ -2131,8 +2161,41 @@ private int topLayer(@Nonnull final Tuple primaryKey) { return (int) Math.floor(-Math.log(u) * lambda); } - @SuppressWarnings("unused") - private boolean shouldSampleCandidate(@Nullable final SplittableRandom random, final int numberOfCandidates, NodeReference nodeReference) { + private boolean usePrimaryCandidateForRepair(@Nonnull final NodeReference candidateReference, + @Nonnull final Tuple toBeDeletedPrimaryKey) { + final Tuple candidatePrimaryKey = candidateReference.getPrimaryKey(); + + // + // If the node reference is the record we are trying to delete we must reject it here as it is not a suitable + // candidate. + // + return !candidatePrimaryKey.equals(toBeDeletedPrimaryKey); + } + + private boolean useSecondaryCandidateForRepair(@Nullable final SplittableRandom random, + @Nonnull final Set initialNodeKeys, + final int numberOfCandidates, + @Nonnull final NodeReference candidateReference, + @Nonnull final Tuple toBeDeletedPrimaryKey) { + final Tuple candidatePrimaryKey = candidateReference.getPrimaryKey(); + + // + // If the node reference is the record we are trying to delete we must reject it here as it is not a suitable + // candidate. + // + if (candidatePrimaryKey.equals(toBeDeletedPrimaryKey)) { + return false; + } + + // + // If the node reference is among the initial nodes we must accept it as they are very likely the best + // candidates. + // + if (initialNodeKeys.contains(candidatePrimaryKey)) { + return true; + } + + // sample all the rest final double sampleRate = (double)getConfig().getM() / numberOfCandidates; if (sampleRate >= 1) { return true; @@ -2148,6 +2211,44 @@ private boolean shouldMaintainStats(@Nonnull final SplittableRandom random) { return random.nextDouble() < getConfig().getMaintainStatsProbability(); } + /** + * Scans all nodes within a given layer of the database. + *

+ * The scan is performed transactionally in batches to avoid loading the entire layer into memory at once. Each + * discovered node is passed to the provided {@link Consumer} for processing. The operation continues fetching + * batches until all nodes in the specified layer have been processed. + * + * @param db the non-null {@link Database} instance to run the scan against. + * @param layer the specific layer index to scan. + * @param batchSize the number of nodes to retrieve and process in each batch. + * @param nodeConsumer the non-null {@link Consumer} that will accept each {@link AbstractNode} + * found in the layer. + */ + @VisibleForTesting + static void scanLayer(@Nonnull final Config config, + @Nonnull final Subspace subspace, + @Nonnull final Database db, + final int layer, + final int batchSize, + @Nonnull final Consumer> nodeConsumer) { + final StorageAdapter storageAdapter = + storageAdapterForLayer(config, subspace, OnWriteListener.NOOP, OnReadListener.NOOP, layer); + final AtomicReference lastPrimaryKeyAtomic = new AtomicReference<>(); + Tuple newPrimaryKey; + do { + final Tuple lastPrimaryKey = lastPrimaryKeyAtomic.get(); + lastPrimaryKeyAtomic.set(null); + newPrimaryKey = db.run(tr -> { + Streams.stream(storageAdapter.scanLayer(tr, layer, lastPrimaryKey, batchSize)) + .forEach(node -> { + nodeConsumer.accept(Objects.requireNonNull(node)); + lastPrimaryKeyAtomic.set(node.getPrimaryKey()); + }); + return lastPrimaryKeyAtomic.get(); + }); + } while (newPrimaryKey != null); + } + /** * Gets the appropriate storage adapter for a given layer. *

@@ -2197,13 +2298,13 @@ private static List drain(@Nonnull Queue queue) { } @FunctionalInterface - private interface CandidateSamplingPredicate { + private interface CandidatePredicate { @Nonnull - static CandidateSamplingPredicate tautology() { - return (random, size, nodeReference) -> true; + static CandidatePredicate tautology() { + return (random, initialNodeKeys, size, nodeReference) -> true; } - boolean test(@Nullable SplittableRandom random, int size, NodeReference nodeReference); + boolean test(@Nullable SplittableRandom random, @Nonnull Set initialNodeKeys, int size, NodeReference nodeReference); } private static class AccessInfoAndNodeExistence { diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java index b3b5ef8a12..b6bfe0d093 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java @@ -31,6 +31,7 @@ import javax.annotation.Nonnull; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.function.Predicate; /** @@ -95,7 +96,9 @@ public NeighborsChangeSet getParent() { @Nonnull @Override public Iterable merge() { - return Iterables.concat(getParent().merge(), insertedNeighborsMap.values()); + return Iterables.concat(Iterables.filter(getParent().merge(), + current -> !insertedNeighborsMap.containsKey(Objects.requireNonNull(current).getPrimaryKey())), + insertedNeighborsMap.values()); } /** diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 9b8be31711..f0b936bfc2 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -52,7 +52,6 @@ import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.AfterTestExecutionCallback; -import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtensionContext; import org.junit.jupiter.api.extension.RegisterExtension; import org.junit.jupiter.api.parallel.Execution; @@ -206,9 +205,9 @@ void testInliningSerialization(final long seed) { static Stream randomSeedsWithConfig() { return RandomizedTestUtils.randomSeeds(0xdeadc0deL) .flatMap(seed -> Sets.cartesianProduct(ImmutableSet.of(true, false), - ImmutableSet.of(true, false), - ImmutableSet.of(true, false), - ImmutableSet.of(true, false)).stream() + ImmutableSet.of(false, true), + ImmutableSet.of(false, true), + ImmutableSet.of(false, true)).stream() .map(arguments -> Arguments.of(ObjectArrays.concat(seed, new Object[] {HNSW.newConfigBuilder() .setMetric(Metric.EUCLIDEAN_METRIC) @@ -220,7 +219,7 @@ static Stream randomSeedsWithConfig() { .setSampleVectorStatsProbability(1.0d) .setMaintainStatsProbability(0.1d) .setStatsThreshold(100) - .setM(32) + .setM(16) .setMMax(32) .setMMax0(64) .build(128)})))); @@ -317,16 +316,17 @@ void testBasicInsert(final long seed, final Config config) { Assertions.assertThat(readIds.size()).isBetween(10, 50); } - @ExtendWith(HNSWTest.DumpLayersIfFailure.class) + //@ExtendWith(HNSWTest.DumpLayersIfFailure.class) @ParameterizedTest @MethodSource("randomSeedsWithConfig") void testBasicInsertDelete(final long seed, final Config config) 
{ final Random random = new Random(seed); final int size = 1000; + final TestOnWriteListener onWriteListener = new TestOnWriteListener(); final TestOnReadListener onReadListener = new TestOnReadListener(); final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), config, - OnWriteListener.NOOP, onReadListener); + onWriteListener, onReadListener); final int k = 50; final List insertedData = randomVectors(random, config.getNumDimensions(), 1000); @@ -342,9 +342,10 @@ void testBasicInsertDelete(final long seed, final Config config) { final List toBeDeleted = pickRandomVectors(random, remainingData, numVectorsPerDeleteBatch); + onWriteListener.reset(); onReadListener.reset(); - long beginTs = System.nanoTime(); + final long beginTs = System.nanoTime(); db.run(tr -> { for (final PrimaryKeyAndVector primaryKeyAndVector : toBeDeleted) { hnsw.delete(tr, primaryKeyAndVector.getPrimaryKey()).join(); @@ -353,12 +354,21 @@ void testBasicInsertDelete(final long seed, final Config config) { }); long endTs = System.nanoTime(); + Assertions.assertThat(onWriteListener.getDeleteCountByLayer().get(0)).isEqualTo(toBeDeleted.size()); + logger.info("delete transaction of {} records after {} records took elapsedTime={}ms; read nodes={}, read bytes={}", numVectorsPerDeleteBatch, size - remainingData.size(), TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer()); + db.run(tr -> { + for (final PrimaryKeyAndVector primaryKeyAndVector : toBeDeleted) { + hnsw.delete(tr, primaryKeyAndVector.getPrimaryKey()).join(); + } + return null; + }); + final Set deletedSet = toBeDeleted.stream().collect(ImmutableSet.toImmutableSet()); remainingData = remainingData.stream() .filter(vector -> !deletedSet.contains(vector)) @@ -374,11 +384,11 @@ void testBasicInsertDelete(final long seed, final Config config) { onReadListener.reset(); - beginTs = System.nanoTime(); + final long beginTsQuery = System.nanoTime(); final List results = db.run(tr -> hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - endTs = System.nanoTime(); + final long endTsQuery = System.nanoTime(); int recallCount = 0; for (ResultEntry resultEntry : results) { @@ -388,20 +398,17 @@ void testBasicInsertDelete(final long seed, final Config config) { } final double recall = (double)recallCount / (double)trueNN.size(); -// if (recall == 0.7) { -// int layer = 0; -// while (true) { -// if (!dumpLayer(hnsw, "debug", layer++)) { -// break; -// } -// } -// } - logger.info("search transaction after delete of {} records took elapsedTime={}ms; read nodes={}, read bytes={}, recall={}", size - remainingData.size(), - TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), + TimeUnit.NANOSECONDS.toMillis(endTsQuery - beginTsQuery), onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), String.format(Locale.ROOT, "%.2f", recall * 100.0d)); + + if (recall <= 0.9) { + db.run(tr -> + hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); + } + Assertions.assertThat(recall).isGreaterThan(0.9); final long remainingNumNodes = countNodesOnLayer(config, 0); @@ -413,7 +420,6 @@ void testBasicInsertDelete(final long seed, final Config config) { db.run(transaction -> StorageAdapter.fetchAccessInfo(hnsw.getConfig(), transaction, hnsw.getSubspace(), OnReadListener.NOOP).join()); Assertions.assertThat(accessInfo).isNull(); - Assertions.assertThat((Double)null).isNotNull(); } @ParameterizedTest() @@ -839,6 +845,27 @@ private void dumpLayers(@Nonnull 
final HNSWTest hnswTest, @Nonnull final Config } } + private static class TestOnWriteListener implements OnWriteListener { + final Map deleteCountByLayer; + + public TestOnWriteListener() { + this.deleteCountByLayer = Maps.newConcurrentMap(); + } + + public Map getDeleteCountByLayer() { + return deleteCountByLayer; + } + + public void reset() { + deleteCountByLayer.clear(); + } + + @Override + public void onNodeDeleted(final int layer, @Nonnull final Tuple primaryKey) { + deleteCountByLayer.compute(layer, (l, oldValue) -> (oldValue == null ? 0 : oldValue) + 1L); + } + } + private static class TestOnReadListener implements OnReadListener { final Map nodeCountByLayer; final Map sumMByLayer; From 387533433ec34d18b7709d211a8b3f8877815afe Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Fri, 12 Dec 2025 19:01:57 +0100 Subject: [PATCH 09/17] better dumping of HNSW layers upon failure --- .../async/hnsw/AbstractStorageAdapter.java | 14 +++++ .../async/hnsw/StorageAdapter.java | 5 ++ .../foundationdb/async/hnsw/HNSWTest.java | 51 +++++++++++-------- 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java index 9f8296fa2f..85b22f7882 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java @@ -113,6 +113,20 @@ public InliningStorageAdapter asInliningStorageAdapter() { return (InliningStorageAdapter)this; } + @Override + public boolean isCompactStorageAdapter() { + final boolean isCompactStorageAdapter = getNodeFactory().getNodeKind() == NodeKind.COMPACT; + Verify.verify(!isCompactStorageAdapter || this instanceof CompactStorageAdapter); + return isCompactStorageAdapter; + } + + @Nonnull + @Override + public CompactStorageAdapter asCompactStorageAdapter() { + Verify.verify(isCompactStorageAdapter()); + return (CompactStorageAdapter)this; + } + @Override @Nonnull public Subspace getSubspace() { diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java index af721b1c61..f8b8d719fe 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java @@ -98,6 +98,11 @@ interface StorageAdapter { @Nonnull InliningStorageAdapter asInliningStorageAdapter(); + boolean isCompactStorageAdapter(); + + @Nonnull + CompactStorageAdapter asCompactStorageAdapter(); + /** * Get the subspace used to store this HNSW structure. 
* @return the subspace diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index f0b936bfc2..7ca4791125 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -52,8 +52,10 @@ import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.AfterTestExecutionCallback; +import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtensionContext; import org.junit.jupiter.api.extension.RegisterExtension; +import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; import org.junit.jupiter.params.ParameterInfo; @@ -67,9 +69,9 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.BufferedWriter; -import java.io.FileWriter; import java.io.IOException; import java.nio.channels.FileChannel; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; @@ -116,6 +118,9 @@ class HNSWTest { @RegisterExtension TestSubspaceExtension rtSecondarySubspace = new TestSubspaceExtension(dbExtension); + @TempDir + Path tempDir; + private Database db; @BeforeEach @@ -316,7 +321,7 @@ void testBasicInsert(final long seed, final Config config) { Assertions.assertThat(readIds.size()).isBetween(10, 50); } - //@ExtendWith(HNSWTest.DumpLayersIfFailure.class) + @ExtendWith(HNSWTest.DumpLayersIfFailure.class) @ParameterizedTest @MethodSource("randomSeedsWithConfig") void testBasicInsertDelete(final long seed, final Config config) { @@ -404,11 +409,6 @@ void testBasicInsertDelete(final long seed, final Config config) { onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), String.format(Locale.ROOT, "%.2f", recall * 100.0d)); - if (recall <= 0.9) { - db.run(tr -> - hnsw.kNearestNeighborsSearch(tr, k, 100, true, queryVector).join()); - } - Assertions.assertThat(recall).isGreaterThan(0.9); final long remainingNumNodes = countNodesOnLayer(config, 0); @@ -710,28 +710,37 @@ private void scanLayer(@Nonnull final Config config, private boolean dumpLayer(@Nonnull final Config config, @Nonnull final String prefix, final int layer) throws IOException { - final String verticesFileName = "/Users/nseemann/Downloads/vertices-" + prefix + "-" + layer + ".csv"; - final String edgesFileName = "/Users/nseemann/Downloads/edges-" + prefix + "-" + layer + ".csv"; + final Path verticesFile = tempDir.resolve("vertices-" + prefix + "-" + layer + ".csv"); + final Path edgesFile = tempDir.resolve("edges-" + prefix + "-" + layer + ".csv"); + + final StorageAdapter storageAdapter = + HNSW.storageAdapterForLayer(config, rtSubspace.getSubspace(), + OnWriteListener.NOOP, OnReadListener.NOOP, layer); final AtomicLong numReadAtomic = new AtomicLong(0L); - try (final BufferedWriter verticesWriter = new BufferedWriter(new FileWriter(verticesFileName)); - final BufferedWriter edgesWriter = new BufferedWriter(new FileWriter(edgesFileName))) { + try (final BufferedWriter verticesWriter = Files.newBufferedWriter(verticesFile); + final BufferedWriter edgesWriter = Files.newBufferedWriter(edgesFile)) { scanLayer(config, layer, 100, node -> { - final CompactNode compactNode = node.asCompactNode(); - final Transformed vector = compactNode.getVector(); + @Nullable 
final Transformed vector = + storageAdapter.isCompactStorageAdapter() + ? node.asCompactNode().getVector() + : null; try { - verticesWriter.write(compactNode.getPrimaryKey().getLong(0) + ","); - final RealVector realVector = vector.getUnderlyingVector(); - for (int i = 0; i < realVector.getNumDimensions(); i++) { - if (i != 0) { - verticesWriter.write(","); + verticesWriter.write(Long.toString(node.getPrimaryKey().getLong(0))); + if (vector != null) { + verticesWriter.write(","); + final RealVector realVector = vector.getUnderlyingVector(); + for (int i = 0; i < realVector.getNumDimensions(); i++) { + if (i != 0) { + verticesWriter.write(","); + } + verticesWriter.write(String.valueOf(realVector.getComponent(i))); } - verticesWriter.write(String.valueOf(realVector.getComponent(i))); } verticesWriter.newLine(); - for (final var neighbor : compactNode.getNeighbors()) { - edgesWriter.write(compactNode.getPrimaryKey().getLong(0) + "," + + for (final var neighbor : node.getNeighbors()) { + edgesWriter.write(node.getPrimaryKey().getLong(0) + "," + neighbor.getPrimaryKey().getLong(0)); edgesWriter.newLine(); } From 52edf876162ad14e4ed4dc3d8d06bf3b91d59927 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Fri, 12 Dec 2025 19:14:28 +0100 Subject: [PATCH 10/17] fixing the style bugs --- build/reports/problems/problems-report.html | 663 ------------------ .../async/hnsw/AbstractStorageAdapter.java | 2 +- .../apple/foundationdb/async/hnsw/HNSW.java | 2 +- .../async/hnsw/StorageTransform.java | 2 + .../foundationdb/async/hnsw/HNSWTest.java | 2 +- 5 files changed, 5 insertions(+), 666 deletions(-) delete mode 100644 build/reports/problems/problems-report.html diff --git a/build/reports/problems/problems-report.html b/build/reports/problems/problems-report.html deleted file mode 100644 index e05028f0f7..0000000000 --- a/build/reports/problems/problems-report.html +++ /dev/null @@ -1,663 +0,0 @@ - - - - - - - - - - - - - Gradle Configuration Cache - - - -

- - - - - - diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java index 85b22f7882..d34f593cdc 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java @@ -271,5 +271,5 @@ public void deleteNode(@Nonnull final Transaction transaction, final int layer, } } - protected abstract void deleteNodeInternal(@Nonnull final Transaction transaction, final int layer, @Nonnull final Tuple primaryKey); + protected abstract void deleteNodeInternal(@Nonnull Transaction transaction, int layer, @Nonnull Tuple primaryKey); } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 955b8f8901..23d601f231 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -1833,7 +1833,7 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, final int topLayer) { - return MoreAsyncUtil.forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), + return forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), layer -> { final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, random.split(), diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java index e286fdc0d2..27cdec9187 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageTransform.java @@ -20,6 +20,7 @@ package com.apple.foundationdb.async.hnsw; +import com.apple.foundationdb.annotation.SpotBugsSuppressWarnings; import com.apple.foundationdb.linear.AffineOperator; import com.apple.foundationdb.linear.FhtKacRotator; import com.apple.foundationdb.linear.LinearOperator; @@ -34,6 +35,7 @@ * (pre-rotated) centroid. This operator is used inside the HNSW to transform back and forth between the coordinate * system of the client and the coordinate system that is currently employed in the HNSW. 
*/ +@SpotBugsSuppressWarnings(value = "SING_SINGLETON_HAS_NONPRIVATE_CONSTRUCTOR", justification = "Singleton designation is a false positive") class StorageTransform extends AffineOperator { private static final StorageTransform IDENTITY_STORAGE_TRANSFORM = new StorageTransform(null, null); diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 7ca4791125..e236731685 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -318,7 +318,7 @@ void testBasicInsert(final long seed, final Config config) { readIds.clear(); scanLayer(config, 1, 100, node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - Assertions.assertThat(readIds.size()).isBetween(10, 50); + Assertions.assertThat(readIds.size()).isBetween(10, 100); } @ExtendWith(HNSWTest.DumpLayersIfFailure.class) From 2b9617c8a2d0fa97d6317fd4f94829de952277cf Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Sun, 14 Dec 2025 22:43:05 +0100 Subject: [PATCH 11/17] improved tests and polishings --- .../async/hnsw/AbstractStorageAdapter.java | 29 +- .../async/hnsw/BaseNeighborsChangeSet.java | 9 + .../async/hnsw/CompactStorageAdapter.java | 2 - .../apple/foundationdb/async/hnsw/Config.java | 104 +++++-- .../async/hnsw/DeleteNeighborsChangeSet.java | 6 + .../apple/foundationdb/async/hnsw/HNSW.java | 272 ++++++++++++------ .../async/hnsw/InliningStorageAdapter.java | 3 + .../async/hnsw/InsertNeighborsChangeSet.java | 5 + .../async/hnsw/NeighborsChangeSet.java | 6 + .../async/hnsw/NodeReferenceAndNode.java | 4 +- .../async/hnsw/OnWriteListener.java | 66 +++-- .../async/hnsw/StorageAdapter.java | 38 ++- .../foundationdb/async/hnsw/ConfigTest.java | 8 + .../async/hnsw/DataRecordsTest.java | 33 +++ .../foundationdb/async/hnsw/HNSWTest.java | 143 ++++----- .../record/metadata/IndexOptions.java | 16 ++ .../indexes/VectorIndexHelper.java | 8 + .../indexes/VectorIndexMaintainerFactory.java | 3 + .../foundationdb/indexes/VectorIndexTest.java | 8 +- 19 files changed, 543 insertions(+), 220 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java index d34f593cdc..02570bc2fa 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/AbstractStorageAdapter.java @@ -159,23 +159,6 @@ public OnReadListener getOnReadListener() { return onReadListener; } - /** - * Asynchronously fetches a node from a specific layer of the HNSW. - *

- * The node is identified by its {@code layer} and {@code primaryKey}. The entire fetch operation is - * performed within the given {@link ReadTransaction}. After the underlying - * fetch operation completes, the retrieved node is validated by the - * {@link #checkNode(Node)} method before the returned future is completed. - * - * @param readTransaction the non-null transaction to use for the read operation - * @param storageTransform an affine vector transformation operator that is used to transform the fetched vector - * into the storage space that is currently being used - * @param layer the layer of the tree from which to fetch the node - * @param primaryKey the non-null primary key that identifies the node to fetch - * - * @return a {@link CompletableFuture} that will complete with the fetched {@link AbstractNode} - * once it has been read from storage and validated - */ @Nonnull @Override public CompletableFuture> fetchNode(@Nonnull final ReadTransaction readTransaction, @@ -198,7 +181,7 @@ public CompletableFuture> fetchNode(@Nonnull final ReadTransacti * @param primaryKey the primary key that uniquely identifies the node to be fetched; must not be {@code null} * * @return a {@link CompletableFuture} that will be completed with the fetched {@link AbstractNode}. - * The future will complete with {@code null} if no node is found for the given key and layer. + * The future will complete with {@code null} if no node is found for the given key and layer. */ @Nonnull protected abstract CompletableFuture> fetchNodeInternal(@Nonnull ReadTransaction readTransaction, @@ -214,7 +197,7 @@ protected abstract CompletableFuture> fetchNodeInternal(@Nonnull * @return the node that was passed in */ @Nullable - private > T checkNode(@Nullable final T node) { + protected > T checkNode(@Nullable final T node) { return node; } @@ -245,7 +228,7 @@ public void writeNode(@Nonnull final Transaction transaction, @Nonnull final Qua } /** - * Writes a single node to the data store as part of a larger transaction. + * Writes a single node to the given layer of the data store as part of a larger transaction. *

* This is an abstract method that concrete implementations must provide. * It is responsible for the low-level persistence of the given {@code node} at a @@ -271,5 +254,11 @@ public void deleteNode(@Nonnull final Transaction transaction, final int layer, } } + /** + * Deletes a single node from the given layer of the data store as part of a larger transaction. + * @param transaction the transaction to use + * @param layer the layer + * @param primaryKey the primary key of the node + */ protected abstract void deleteNodeInternal(@Nonnull Transaction transaction, int layer, @Nonnull Tuple primaryKey); } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/BaseNeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/BaseNeighborsChangeSet.java index 490b4bc844..f7ee479920 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/BaseNeighborsChangeSet.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/BaseNeighborsChangeSet.java @@ -67,6 +67,15 @@ public BaseNeighborsChangeSet getParent() { return null; } + /** + * Returns {@code false} as this change set is a base change set. It does not represent any changes. + * @return {@code false} as this change set does not have any changes. + */ + @Override + public boolean hasChanges() { + return false; + } + /** * Retrieves the list of neighbors associated with this object. *

diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java index debba0d16c..b6fa91b0ab 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/CompactStorageAdapter.java @@ -94,8 +94,6 @@ public Transformed getVector(@Nonnull final NodeReference nodeRefere * * @return a future that will complete with the fetched {@link AbstractNode} or {@code null} if the node cannot * be fetched - * - * @throws IllegalStateException if the node cannot be found in the database for the given key */ @Nonnull @Override diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java index 66d4322ebf..931f256879 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java @@ -38,6 +38,7 @@ public final class Config { public static final int DEFAULT_M_MAX_0 = 2 * DEFAULT_M; public static final int DEFAULT_M_MAX = DEFAULT_M; public static final int DEFAULT_EF_CONSTRUCTION = 200; + public static final int DEFAULT_EF_REPAIR = 64; public static final boolean DEFAULT_EXTEND_CANDIDATES = false; public static final boolean DEFAULT_KEEP_PRUNED_CONNECTIONS = false; // stats @@ -47,10 +48,10 @@ public final class Config { // RaBitQ public static final boolean DEFAULT_USE_RABITQ = false; public static final int DEFAULT_RABITQ_NUM_EX_BITS = 4; - // concurrency public static final int DEFAULT_MAX_NUM_CONCURRENT_NODE_FETCHES = 16; public static final int DEFAULT_MAX_NUM_CONCURRENT_NEIGHBOR_FETCHES = 16; + public static final int DEFAULT_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER = 2; @Nonnull private final Metric metric; @@ -60,6 +61,7 @@ public final class Config { private final int mMax; private final int mMax0; private final int efConstruction; + private final int efRepair; private final boolean extendCandidates; private final boolean keepPrunedConnections; private final double sampleVectorStatsProbability; @@ -69,13 +71,15 @@ public final class Config { private final int raBitQNumExBits; private final int maxNumConcurrentNodeFetches; private final int maxNumConcurrentNeighborhoodFetches; + private final int maxNumConcurrentDeleteFromLayer; private Config(@Nonnull final Metric metric, final int numDimensions, final boolean useInlining, final int m, - final int mMax, final int mMax0, final int efConstruction, final boolean extendCandidates, - final boolean keepPrunedConnections, final double sampleVectorStatsProbability, - final double maintainStatsProbability, final int statsThreshold, final boolean useRaBitQ, - final int raBitQNumExBits, final int maxNumConcurrentNodeFetches, - final int maxNumConcurrentNeighborhoodFetches) { + final int mMax, final int mMax0, final int efConstruction, final int efRepair, + final boolean extendCandidates, final boolean keepPrunedConnections, + final double sampleVectorStatsProbability, final double maintainStatsProbability, + final int statsThreshold, final boolean useRaBitQ, final int raBitQNumExBits, + final int maxNumConcurrentNodeFetches, final int maxNumConcurrentNeighborhoodFetches, + final int maxNumConcurrentDeleteFromLayer) { Preconditions.checkArgument(numDimensions >= 1, "numDimensions must be (1, MAX_INT]"); 
Preconditions.checkArgument(m >= 4 && m <= 200, "m must be [4, 200]"); Preconditions.checkArgument(mMax >= 4 && mMax <= 200, "mMax must be [4, 200]"); @@ -84,6 +88,8 @@ private Config(@Nonnull final Metric metric, final int numDimensions, final bool Preconditions.checkArgument(mMax <= mMax0, "mMax must be less than or equal to mMax0"); Preconditions.checkArgument(efConstruction >= 100 && efConstruction <= 400, "efConstruction must be [100, 400]"); + Preconditions.checkArgument(efRepair >= m && efRepair <= 400, + "efRepair must be [m, 400]"); Preconditions.checkArgument(!useRaBitQ || (sampleVectorStatsProbability > 0.0d && sampleVectorStatsProbability <= 1.0d), "sampleVectorStatsProbability out of range"); @@ -98,6 +104,9 @@ private Config(@Nonnull final Metric metric, final int numDimensions, final bool Preconditions.checkArgument(maxNumConcurrentNeighborhoodFetches > 0 && maxNumConcurrentNeighborhoodFetches <= 64, "maxNumConcurrentNeighborhoodFetches must be (0, 64]"); + Preconditions.checkArgument(maxNumConcurrentDeleteFromLayer > 0 && + maxNumConcurrentDeleteFromLayer <= 64, + "maxNumConcurrentDeleteFromLayer must be (0, 64]"); this.metric = metric; this.numDimensions = numDimensions; @@ -106,6 +115,7 @@ private Config(@Nonnull final Metric metric, final int numDimensions, final bool this.mMax = mMax; this.mMax0 = mMax0; this.efConstruction = efConstruction; + this.efRepair = efRepair; this.extendCandidates = extendCandidates; this.keepPrunedConnections = keepPrunedConnections; this.sampleVectorStatsProbability = sampleVectorStatsProbability; @@ -115,6 +125,7 @@ private Config(@Nonnull final Metric metric, final int numDimensions, final bool this.raBitQNumExBits = raBitQNumExBits; this.maxNumConcurrentNodeFetches = maxNumConcurrentNodeFetches; this.maxNumConcurrentNeighborhoodFetches = maxNumConcurrentNeighborhoodFetches; + this.maxNumConcurrentDeleteFromLayer = maxNumConcurrentDeleteFromLayer; } /** @@ -198,14 +209,24 @@ public int getMMax0() { /** * Maximum size of the search queues (one independent queue per layer) that are used during the insertion of a new - * node. If {@code efConstruction} is set to {@code 1}, the search naturally follows a greedy approach - * (monotonous descent), whereas a high number for {@code efConstruction} allows for a more nuanced search that can - * tolerate (false) local minima. + * node. If {@code efConstruction} is set to a smaller number, the search naturally follows a more greedy approach + * (monotonous descent), whereas a higher number for {@code efConstruction} allows for a more nuanced search that + * can tolerate (false) local minima. */ public int getEfConstruction() { return efConstruction; } + /** + * Maximum number of candidate nodes that are considered when an HNSW layer is locally repaired as part of a + * delete operation. A smaller number causes the delete operation to create a smaller set of candidate nodes, + * which improves repair performance but decreases repair quality; a higher number results in qualitatively + * better repairs at the expense of slower performance.
+ */ + public int getEfRepair() { + return efRepair; + } + /** * Indicator to signal if, during the insertion of a node, the set of nearest neighbors of that node is to be * extended by the actual neighbors of those neighbors to form a set of candidates that the new node may be @@ -283,13 +304,20 @@ public int getMaxNumConcurrentNeighborhoodFetches() { return maxNumConcurrentNeighborhoodFetches; } + /** + * Maximum number of delete operations that can run concurrently during a delete operation. + */ + public int getMaxNumConcurrentDeleteFromLayer() { + return maxNumConcurrentDeleteFromLayer; + } + @Nonnull public ConfigBuilder toBuilder() { return new ConfigBuilder(getMetric(), isUseInlining(), getM(), getMMax(), getMMax0(), - getEfConstruction(), isExtendCandidates(), isKeepPrunedConnections(), + getEfConstruction(), getEfRepair(), isExtendCandidates(), isKeepPrunedConnections(), getSampleVectorStatsProbability(), getMaintainStatsProbability(), getStatsThreshold(), isUseRaBitQ(), getRaBitQNumExBits(), getMaxNumConcurrentNodeFetches(), - getMaxNumConcurrentNeighborhoodFetches()); + getMaxNumConcurrentNeighborhoodFetches(), getMaxNumConcurrentDeleteFromLayer()); } @Override @@ -303,20 +331,23 @@ public boolean equals(final Object o) { final Config config = (Config)o; return numDimensions == config.numDimensions && useInlining == config.useInlining && m == config.m && mMax == config.mMax && mMax0 == config.mMax0 && efConstruction == config.efConstruction && - extendCandidates == config.extendCandidates && keepPrunedConnections == config.keepPrunedConnections && + efRepair == config.efRepair && extendCandidates == config.extendCandidates && + keepPrunedConnections == config.keepPrunedConnections && Double.compare(sampleVectorStatsProbability, config.sampleVectorStatsProbability) == 0 && Double.compare(maintainStatsProbability, config.maintainStatsProbability) == 0 && statsThreshold == config.statsThreshold && useRaBitQ == config.useRaBitQ && raBitQNumExBits == config.raBitQNumExBits && metric == config.metric && maxNumConcurrentNodeFetches == config.maxNumConcurrentNodeFetches && - maxNumConcurrentNeighborhoodFetches == config.maxNumConcurrentNeighborhoodFetches; + maxNumConcurrentNeighborhoodFetches == config.maxNumConcurrentNeighborhoodFetches && + maxNumConcurrentDeleteFromLayer == config.maxNumConcurrentDeleteFromLayer; } @Override public int hashCode() { - return Objects.hash(metric, numDimensions, useInlining, m, mMax, mMax0, efConstruction, extendCandidates, - keepPrunedConnections, sampleVectorStatsProbability, maintainStatsProbability, statsThreshold, - useRaBitQ, raBitQNumExBits, maxNumConcurrentNodeFetches, maxNumConcurrentNeighborhoodFetches); + return Objects.hash(metric, numDimensions, useInlining, m, mMax, mMax0, efConstruction, efRepair, + extendCandidates, keepPrunedConnections, sampleVectorStatsProbability, maintainStatsProbability, + statsThreshold, useRaBitQ, raBitQNumExBits, maxNumConcurrentNodeFetches, + maxNumConcurrentNeighborhoodFetches, maxNumConcurrentDeleteFromLayer); } @Override @@ -325,13 +356,14 @@ public String toString() { return "Config[" + "metric=" + getMetric() + ", numDimensions=" + getNumDimensions() + ", isUseInlining=" + isUseInlining() + ", M=" + getM() + ", MMax=" + getMMax() + ", MMax0=" + getMMax0() + ", efConstruction=" + getEfConstruction() + - ", isExtendCandidates=" + isExtendCandidates() + + ", efRepair=" + getEfRepair() + ", isExtendCandidates=" + isExtendCandidates() + ", isKeepPrunedConnections=" + isKeepPrunedConnections() + ", 
sampleVectorStatsProbability=" + getSampleVectorStatsProbability() + ", mainStatsProbability=" + getMaintainStatsProbability() + ", statsThreshold=" + getStatsThreshold() + ", useRaBitQ=" + isUseRaBitQ() + ", raBitQNumExBits=" + getRaBitQNumExBits() + ", maxNumConcurrentNodeFetches=" + getMaxNumConcurrentNodeFetches() + ", maxNumConcurrentNeighborhoodFetches=" + getMaxNumConcurrentNeighborhoodFetches() + + ", maxNumConcurrentDeleteFromLayer=" + getMaxNumConcurrentDeleteFromLayer() + "]"; } @@ -350,6 +382,7 @@ public static class ConfigBuilder { private int mMax = DEFAULT_M_MAX; private int mMax0 = DEFAULT_M_MAX_0; private int efConstruction = DEFAULT_EF_CONSTRUCTION; + private int efRepair = DEFAULT_EF_REPAIR; private boolean extendCandidates = DEFAULT_EXTEND_CANDIDATES; private boolean keepPrunedConnections = DEFAULT_KEEP_PRUNED_CONNECTIONS; @@ -362,22 +395,25 @@ public static class ConfigBuilder { private int maxNumConcurrentNodeFetches = DEFAULT_MAX_NUM_CONCURRENT_NODE_FETCHES; private int maxNumConcurrentNeighborhoodFetches = DEFAULT_MAX_NUM_CONCURRENT_NEIGHBOR_FETCHES; + private int maxNumConcurrentDeleteFromLayer = DEFAULT_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER; public ConfigBuilder() { } public ConfigBuilder(@Nonnull final Metric metric, final boolean useInlining, final int m, final int mMax, - final int mMax0, final int efConstruction, final boolean extendCandidates, - final boolean keepPrunedConnections, final double sampleVectorStatsProbability, - final double maintainStatsProbability, final int statsThreshold, final boolean useRaBitQ, - final int raBitQNumExBits, final int maxNumConcurrentNodeFetches, - final int maxNumConcurrentNeighborhoodFetches) { + final int mMax0, final int efConstruction, final int efRepair, + final boolean extendCandidates, final boolean keepPrunedConnections, + final double sampleVectorStatsProbability, final double maintainStatsProbability, + final int statsThreshold, final boolean useRaBitQ, final int raBitQNumExBits, + final int maxNumConcurrentNodeFetches, final int maxNumConcurrentNeighborhoodFetches, + final int maxNumConcurrentDeleteFromLayer) { this.metric = metric; this.useInlining = useInlining; this.m = m; this.mMax = mMax; this.mMax0 = mMax0; this.efConstruction = efConstruction; + this.efRepair = efRepair; this.extendCandidates = extendCandidates; this.keepPrunedConnections = keepPrunedConnections; this.sampleVectorStatsProbability = sampleVectorStatsProbability; @@ -387,6 +423,7 @@ public ConfigBuilder(@Nonnull final Metric metric, final boolean useInlining, fi this.raBitQNumExBits = raBitQNumExBits; this.maxNumConcurrentNodeFetches = maxNumConcurrentNodeFetches; this.maxNumConcurrentNeighborhoodFetches = maxNumConcurrentNeighborhoodFetches; + this.maxNumConcurrentDeleteFromLayer = maxNumConcurrentDeleteFromLayer; } @Nonnull @@ -450,6 +487,16 @@ public ConfigBuilder setEfConstruction(final int efConstruction) { return this; } + public int getEfRepair() { + return efRepair; + } + + @Nonnull + public ConfigBuilder setEfRepair(final int efRepair) { + this.efRepair = efRepair; + return this; + } + public boolean isExtendCandidates() { return extendCandidates; } @@ -538,12 +585,21 @@ public ConfigBuilder setMaxNumConcurrentNeighborhoodFetches(final int maxNumConc return this; } + public int getMaxNumConcurrentDeleteFromLayer() { + return maxNumConcurrentDeleteFromLayer; + } + + public ConfigBuilder setMaxNumConcurrentDeleteFromLayer(final int maxNumConcurrentDeleteFromLayer) { + this.maxNumConcurrentDeleteFromLayer = 
maxNumConcurrentDeleteFromLayer; + return this; + } + public Config build(final int numDimensions) { return new Config(getMetric(), numDimensions, isUseInlining(), getM(), getMMax(), - getMMax0(), getEfConstruction(), isExtendCandidates(), isKeepPrunedConnections(), + getMMax0(), getEfConstruction(), getEfRepair(), isExtendCandidates(), isKeepPrunedConnections(), getSampleVectorStatsProbability(), getMaintainStatsProbability(), getStatsThreshold(), isUseRaBitQ(), getRaBitQNumExBits(), getMaxNumConcurrentNodeFetches(), - getMaxNumConcurrentNeighborhoodFetches()); + getMaxNumConcurrentNeighborhoodFetches(), getMaxNumConcurrentDeleteFromLayer()); } } } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java index 194db87eab..0bdd1eb3dd 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java @@ -83,6 +83,12 @@ public NeighborsChangeSet getParent() { return parent; } + @Override + public boolean hasChanges() { + // We can probably do better by testing if the deletion has an effect on the merge. + return true; + } + /** * Merges the neighbors from the parent context, filtering out any neighbors that have been marked as deleted. *

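For context, a minimal usage sketch of the delete path together with the configuration knobs introduced above (efRepair, maxNumConcurrentDeleteFromLayer). It is illustrative only: it uses only calls that appear elsewhere in this patch and its tests (HNSW.newConfigBuilder, setEfRepair, setMaxNumConcurrentDeleteFromLayer, the HNSW constructor, delete), while subspace, executor, db, and primaryKey are placeholder objects supplied by the caller, not part of this change.

    // Illustrative sketch only -- subspace, executor, db, and primaryKey are placeholders.
    final Config config = HNSW.newConfigBuilder()
            .setMetric(Metric.EUCLIDEAN_METRIC)
            .setM(16)
            .setMMax(32)
            .setMMax0(64)
            .setEfRepair(64)                        // candidates considered when locally repairing a layer after a delete
            .setMaxNumConcurrentDeleteFromLayer(2)  // layers deleted from / repaired concurrently
            .build(128);                            // number of dimensions

    final HNSW hnsw = new HNSW(subspace, executor, config, OnWriteListener.NOOP, OnReadListener.NOOP);

    db.run(tr -> {
        // removes the vector stored under primaryKey from every layer and repairs the affected neighbor lists
        hnsw.delete(tr, primaryKey).join();
        return null;
    });
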
diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 23d601f231..56027ce269 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -462,7 +462,7 @@ private CompletableFuture greedySearchInliningLayer(@ /** * Searches a single layer of the graph to find the nearest neighbors to a query vector. *

- * This method implements the greedy search algorithm used in HNSW (Hierarchical Navigable Small World) + * This method implements the search algorithm used in HNSW (Hierarchical Navigable Small World) * graphs for a specific layer. It begins with a set of entry points and iteratively explores the graph, * always moving towards nodes that are closer to the {@code queryVector}. *

@@ -1260,8 +1260,10 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) final NodeReferenceAndNode selectedNeighbor = selectedNeighbors.get(i); final NeighborsChangeSet changeSet = changeSets.get(i); - storageAdapter.writeNode(transaction, quantizer, - layer, selectedNeighbor.getNode(), changeSet); + if (changeSet.hasChanges()) { + storageAdapter.writeNode(transaction, quantizer, + layer, selectedNeighbor.getNode(), changeSet); + } } return ImmutableList.copyOf(searchResult); }); @@ -1620,27 +1622,23 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) final int layer, @Nonnull final Map> nodeCache) { final Iterable toBeFetched = - resolveNeighborReferences(initialNodeReferenceAndNodes, random, samplingPredicate); + findNeighborReferences(initialNodeReferenceAndNodes, random, samplingPredicate); return fetchNeighborhoodReferences(storageAdapter, readTransaction, storageTransform, layer, toBeFetched, nodeCache); } /** - * Compute the neighbors of an iterable of initial nodes that is passed in. Hop is defined as the - * set of all nodes that are neighbors of the initial nodes. Note that the neighbor of an initial node might - * be another initial node. If that is the case the node is returned. If that is not desired by the caller, the - * caller needs to remove those nodes via a subtraction of the initial set. + * Return the union of the nodes passed in and their neighbors. * - * @param the type of the {@link NodeReference} - * storage space that is currently being used - * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes, which have already been evaluated + * @param the type of the {@link NodeReference} storage space that is currently being used + * @param initialNodeReferenceAndNodes an {@link Iterable} of initial candidate nodes * - * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReferenceWithDistance} + * @return a {@link CompletableFuture} which will complete with a set of {@link NodeReference}s */ private Set - resolveNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, - @Nullable final SplittableRandom random, - @Nonnull final CandidatePredicate candidatePredicate) { + findNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, + @Nullable final SplittableRandom random, + @Nonnull final CandidatePredicate candidatePredicate) { final Set neighborReferences = Sets.newHashSet(); final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { @@ -1675,7 +1673,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) } } - // sample down the set of neighbors + // sample down the set of neighbors by testing the candidate predicate final ImmutableSet.Builder resultBuilder = ImmutableSet.builder(); for (final NodeReference neighborReference : neighborReferences) { if (candidatePredicate.test(random, initialNodesMap.keySet(), @@ -1749,10 +1747,10 @@ private void writeLonelyNodeOnLayer(@Nonnull final Qua /** * Deletes a vector with its associated primary key from the HNSW graph. *

- * The method first determines a random layer for the new node, called the {@code top layer}. It then applies a - * deletion algorithm to all layers from {@code 0} to including the {@code top layer} that removes the record from - * the index and locally repairs the relationships between nearby other vectors that were affected by the delete - * operation. + * The method first determines the random layer that is used for the node, called the {@code top layer}. It then + * applies a deletion algorithm to all layers from {@code 0} to including the {@code top layer} that removes the + * record from the structure and locally repairs the relationships between nearby other vectors that were affected + * by the delete operation. * * @param transaction the {@link Transaction} context for all database operations * @param primaryKey the unique {@link Tuple} primary key for the new node being inserted @@ -1798,13 +1796,14 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N potentialEntryNodeReferences.get(i); if (potentialEntyNodeReference != null) { StorageAdapter.writeAccessInfo(transaction, getSubspace(), - accessInfo.withNewEntryNodeReference(potentialEntyNodeReference), getOnWriteListener()); + accessInfo.withNewEntryNodeReference(potentialEntyNodeReference), + getOnWriteListener()); // early out return AsyncUtil.DONE; } } - // officially there is no data in the structure, delete access info to start new + // there is no data in the structure, delete access info to start new StorageAdapter.deleteAccessInfo(transaction, getSubspace(), getOnWriteListener()); } return AsyncUtil.DONE; @@ -1813,8 +1812,7 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N } /** - * Deletes a node from the HNSW graph across multiple layers, using a primary key and starting from a given top - * layer. + * Deletes a node from the HNSW graph across multiple layers, using a primary key and a given top layer. * * @param transaction the transaction to use for database operations * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the @@ -1833,21 +1831,18 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, final int topLayer) { + // delete the node from all layers in parallel (inside layer in [0, topLayer]) return forEach(() -> IntStream.rangeClosed(0, topLayer).iterator(), - layer -> { - final StorageAdapter storageAdapter = getStorageAdapterForLayer(layer); - return deleteFromLayer(storageAdapter, transaction, storageTransform, quantizer, random.split(), - layer, primaryKey); - }, - getConfig().getMaxNumConcurrentNeighborhoodFetches(), + layer -> + deleteFromLayer(getStorageAdapterForLayer(layer), transaction, storageTransform, quantizer, + random.split(), layer, primaryKey), + getConfig().getMaxNumConcurrentDeleteFromLayer(), executor); } /** - * Deletes a node from a specified layer of the HNSW graph. - *

- * This method orchestrates the complete deletion process for a single layer. - *

+ * Deletes a node from a specified layer of the HNSW graph. This method orchestrates the complete deletion process + * for a single layer. * * @param the type of the node reference, extending {@link NodeReference} * @param storageAdapter the storage adapter for reading from and writing to the graph @@ -1887,13 +1882,18 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi .thenCompose(candidates -> { initializeCandidateChangeSetMap(toBeDeletedPrimaryKey, toBeDeletedNode, candidates, candidateChangeSetMap); + // resolve the actually existing direct neighbors final ImmutableList primaryNeighbors = primaryNeighbors(toBeDeletedNode, candidateChangeSetMap); + // + // Repair each primary neighbor in parallel, there should not be much actual I/O, + // except in edge cases, but we should still parallelize it. + // return forEach(primaryNeighbors, neighborReference -> repairNeighbor(storageAdapter, transaction, - storageTransform, quantizer, layer, neighborReference, + storageTransform, estimator, layer, neighborReference, candidates, candidateChangeSetMap, nodeCache), getConfig().getMaxNumConcurrentNeighborhoodFetches(), executor) .thenApply(ignored -> { @@ -1912,6 +1912,11 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi final int currentMMax = layer == 0 ? getConfig().getMMax0() : getConfig().getMMax(); + // + // If we previously went beyond the mMax/mMax0, we need to prune the neighbors. + // Pruning is independent among different nodes -- we can therefore prune in + // parallel. + // return forEach(candidateChangeSetMap.entrySet(), // for each modified neighbor set changeSetEntry -> { final NodeReferenceWithVector candidateReference = @@ -1936,13 +1941,20 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi .thenApply(ignored -> candidateReferencesMap); }) .thenApply(candidateReferencesMap -> { + // + // Finally delete the node we set out to delete and persist the change sets for all + // repaired nodes. + // storageAdapter.deleteNode(transaction, layer, toBeDeletedPrimaryKey); for (final Map.Entry> changeSetEntry : candidateChangeSetMap.entrySet()) { - final AbstractNode candidateNode = - nodeFromCache(changeSetEntry.getKey(), nodeCache); - storageAdapter.writeNode(transaction, quantizer, - layer, candidateNode, changeSetEntry.getValue()); + final NeighborsChangeSet changeSet = changeSetEntry.getValue(); + if (changeSet.hasChanges()) { + final AbstractNode candidateNode = + nodeFromCache(changeSetEntry.getKey(), nodeCache); + storageAdapter.writeNode(transaction, quantizer, + layer, candidateNode, changeSet); + } } // @@ -1966,36 +1978,17 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi }); } - @Nonnull - private ImmutableList - primaryNeighbors(@Nonnull final AbstractNode toBeDeletedNode, - @Nonnull final Map> candidateChangeSetMap) { - // - // All candidates are definitely existing and the candidates hold all existing primary - // candidates. 
- // - final ImmutableList.Builder primaryNeighborsBuilder = ImmutableList.builder(); - for (N potentialPrimaryNeighbor : toBeDeletedNode.getNeighbors()) { - if (candidateChangeSetMap.containsKey(potentialPrimaryNeighbor.getPrimaryKey())) { - primaryNeighborsBuilder.add(potentialPrimaryNeighbor); - } - } - return primaryNeighborsBuilder.build(); - } - - private - void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, - @Nonnull final AbstractNode toBeDeletedNode, - @Nonnull final List> candidates, - @Nonnull final Map> candidateChangeSetMap) { + private void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, + @Nonnull final AbstractNode toBeDeletedNode, + @Nonnull final List> candidates, + @Nonnull final Map> candidateChangeSetMap) { for (final NodeReferenceAndNode candidate : candidates) { final AbstractNode candidateNode = candidate.getNode(); boolean foundToBeDeleted = false; for (final N neighborOfCandidate : candidateNode.getNeighbors()) { if (neighborOfCandidate.getPrimaryKey().equals(toBeDeletedPrimaryKey)) { // - // Make sure the neighbor pointing to the node-to-be-deleted is deleted as - // well. + // Make sure a neighbor pointing to the node being deleted is deleted as well. // candidateChangeSetMap.put(candidateNode.getPrimaryKey(), new DeleteNeighborsChangeSet<>( @@ -2006,6 +1999,7 @@ void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, } } if (!foundToBeDeleted) { + // if there is no reference back to the node being deleted, just create the base set candidateChangeSetMap.put(candidateNode.getPrimaryKey(), new BaseNeighborsChangeSet<>(candidateNode.getNeighbors())); } @@ -2015,6 +2009,51 @@ void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, } } + /** + * Compile a list of node references that definitely exist. The neighbor list of a node may contain node + * references to neighbors that don't exist anymore (stale reference). The (non-existing) nodes that these node + * references might refer to must not be repaired as that may resurrect a node. + *

+ * We know that the candidate change set map only contains keys for nodes that exist AND that the candidate change + * set map contains all primary neighbors (if they exist). Therefore, we filter the neighbors list from the node by + * cross-referencing the change set map. + * @param type parameter extending {@link NodeReference} + * @param toBeDeletedNode the node that is being deleted. + * @param candidateChangeSetMap the initialized candidate change set map. + * @return a list of existing primary neighbors + */ + @Nonnull + private ImmutableList + primaryNeighbors(@Nonnull final AbstractNode toBeDeletedNode, + @Nonnull final Map> candidateChangeSetMap) { + // + // All entries in the change set map definitely exist and the candidate change set map hold all keys for all + // existing primary candidates. + // + final ImmutableList.Builder primaryNeighborsBuilder = ImmutableList.builder(); + for (final N potentialPrimaryNeighbor : toBeDeletedNode.getNeighbors()) { + if (candidateChangeSetMap.containsKey(potentialPrimaryNeighbor.getPrimaryKey())) { + primaryNeighborsBuilder.add(potentialPrimaryNeighbor); + } + } + return primaryNeighborsBuilder.build(); + } + + /** + * Find candidates starting from the node to be deleted. To this end we find all the existing first degree (primary) + * and second-degree (secondary) neighbors. As that set is too big to consider for the repair we rely on sampling + * to eventually compile a list of roughly {@code efRepair} number of candidates. + * + * @param type parameter extending {@link NodeReference} + * @param storageAdapter the storage adapter for the layer + * @param transaction the transaction + * @param storageTransform the storage transform + * @param random a {@link SplittableRandom} used for sampling the candidate set + * @param layer the layer + * @param toBeDeletedNodeReferenceAndNode the node that is about to be deleted + * @param nodeCache the node cache to avoid repeated fetches + * @return a future that if successful completes with {@code null} + */ @Nonnull private CompletableFuture>> findCandidates(final @Nonnull StorageAdapter storageAdapter, @@ -2027,13 +2066,13 @@ void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, return neighbors(storageAdapter, transaction, storageTransform, random, ImmutableList.of(toBeDeletedNodeReferenceAndNode), ((r, initialNodeKeys, size, nodeReference) -> - usePrimaryCandidateForRepair(nodeReference, + shouldUsePrimaryCandidateForRepair(nodeReference, toBeDeletedNodeReferenceAndNode.getNodeReference().getPrimaryKey())), layer, nodeCache) .thenCompose(candidates -> neighbors(storageAdapter, transaction, storageTransform, random, candidates, ((r, initialNodeKeys, size, nodeReference) -> - useSecondaryCandidateForRepair(r, initialNodeKeys, size, nodeReference, + shouldUseSecondaryCandidateForRepair(r, initialNodeKeys, size, nodeReference, toBeDeletedNodeReferenceAndNode.getNodeReference().getPrimaryKey())), layer, nodeCache)) .thenApply(candidates -> { @@ -2042,33 +2081,51 @@ void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, for (final NodeReferenceAndNode candidate : candidates) { candidateStringsBuilder.add(candidate.getNode().getPrimaryKey().toString()); } - logger.trace("resolved at layer={} num={} candidates={}", layer, candidates.size(), + logger.trace("found at layer={} num={} candidates={}", layer, candidates.size(), String.join(",", candidateStringsBuilder.build())); } return candidates; }); } + /** + * Repair a neighbor node of the node that is 
being deleted using a set of candidates. All candidates contain only + * the vector (in addition to identifying information like the primary key). The logic in + * computes distances between the neighbor vector and each candidate vector which is required by + * {@link #repairInsForNeighborNode}. + * + * @param type parameter extending {@link NodeReference} + * @param storageAdapter the storage adapter for the layer + * @param transaction the transaction + * @param storageTransform the storage transform + * @param estimator an estimator for distances + * @param layer the layer + * @param neighborReference the reference for which this method repairs incoming references + * @param candidates the set of candidates + * @param neighborChangeSetMap the change set map which records all changes to all nodes that are being repaired + * @param nodeCache the node cache to avoid repeated fetches + * @return a future that if successful completes with {@code null} + */ private @Nonnull CompletableFuture repairNeighbor(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @Nonnull final AffineOperator storageTransform, - @Nonnull final Quantizer quantizer, + @Nonnull final Estimator estimator, final int layer, @Nonnull final N neighborReference, - @Nonnull final Collection> sampledCandidates, + @Nonnull final Collection> candidates, @Nonnull final Map> neighborChangeSetMap, @Nonnull final Map> nodeCache) { - final Estimator estimator = quantizer.estimator(); return fetchNodeIfNotCached(storageAdapter, transaction, storageTransform, layer, neighborReference, nodeCache) .thenCompose(neighborNode -> { final ImmutableList.Builder candidatesReferencesBuilder = ImmutableList.builder(); - final Transformed neighborVector = storageAdapter.getVector(neighborReference, neighborNode); + final Transformed neighborVector = + storageAdapter.getVector(neighborReference, neighborNode); // transform the NodeReferencesWithVectors into NodeReferencesWithDistance - for (final NodeReferenceAndNode candidate : sampledCandidates) { + for (final NodeReferenceAndNode candidate : candidates) { // do not add the candidate if that candidate is in fact the neighbor itself if (!candidate.getNodeReference().getPrimaryKey().equals(neighborReference.getPrimaryKey())) { final Transformed candidateVector = @@ -2085,6 +2142,25 @@ void initializeCandidateChangeSetMap(@Nonnull final Tuple toBeDeletedPrimaryKey, }); } + /** + * Repairs the ins of a neighbor node of the node that is being deleted using a set of candidates. Each such + * neighbor is part of a set that is referred to as {@code p_out} in literature. In this method we only repair + * incoming references to this node. As this method is called once per direct neighbor and all direct neighbors are + * in the candidate set, outgoing references from this node to other nodes (in {@code p_out}) are repaired when this + * method is called for the respective neighbors. 
+ * + * @param type parameter extending {@link NodeReference} + * @param storageAdapter the storage adapter for the layer + * @param transaction the transaction + * @param storageTransform the storage transform + * @param estimator an estimator for distances + * @param layer the layer + * @param neighborReference the reference for which this method repairs incoming references + * @param candidates the set of candidates + * @param neighborChangeSetMap the change set map which records all changes to all nodes that are being repaired + * @param nodeCache the node cache to avoid repeated fetches + * @return a future that if successful completes with {@code null} + */ private CompletableFuture repairInsForNeighborNode(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, @@ -2146,12 +2222,10 @@ private SplittableRandom random(@Nonnull final Tuple primaryKey) { /** * Calculates a layer for a new element to be inserted or for an element to be deleted from. *

- * The layer is selected according to a logarithmic distribution, which ensures that - * the probability of choosing a higher layer decreases exponentially. This is - * achieved by applying the inverse transform sampling method. The specific formula - * is {@code floor(-ln(u) * lambda)}, where {@code u} is a uniform random - * number and {@code lambda} is a normalization factor derived from a system - * configuration parameter {@code M}. + * The layer is selected according to a logarithmic distribution, which ensures that the probability of choosing + * a higher layer decreases exponentially. This is achieved by applying the inverse transform sampling method. + * The specific formula is {@code floor(-ln(u) * lambda)}, where {@code u} is a uniform random number and + * {@code lambda} is a normalization factor derived from a system configuration parameter {@code M}. * @param primaryKey the primary key of the record to be inserted/updated/deleted * @return a non-negative integer representing the randomly selected layer */ @@ -2161,8 +2235,16 @@ private int topLayer(@Nonnull final Tuple primaryKey) { return (int) Math.floor(-Math.log(u) * lambda); } - private boolean usePrimaryCandidateForRepair(@Nonnull final NodeReference candidateReference, - @Nonnull final Tuple toBeDeletedPrimaryKey) { + /** + * Predicate to determine if a potential candidate is to be used as a candidate for repairing the HNSW. + * The predicate rejects the candidate reference if it is referring to the node that is being deleted, otherwise the + * predicate accepts the candidate reference. + * @param candidateReference a potential candidate that is either accepted or rejected + * @param toBeDeletedPrimaryKey the {@link Tuple} representing the node that is being deleted + * @return {@code true} iff {@code candidateReference} is accepted as an actual candidate for repair. + */ + private boolean shouldUsePrimaryCandidateForRepair(@Nonnull final NodeReference candidateReference, + @Nonnull final Tuple toBeDeletedPrimaryKey) { final Tuple candidatePrimaryKey = candidateReference.getPrimaryKey(); // @@ -2172,11 +2254,26 @@ private boolean usePrimaryCandidateForRepair(@Nonnull final NodeReference candid return !candidatePrimaryKey.equals(toBeDeletedPrimaryKey); } - private boolean useSecondaryCandidateForRepair(@Nullable final SplittableRandom random, - @Nonnull final Set initialNodeKeys, - final int numberOfCandidates, - @Nonnull final NodeReference candidateReference, - @Nonnull final Tuple toBeDeletedPrimaryKey) { + /** + * Predicate to determine if a potential candidate is to be used ad a candidate for repairing the HNSW. + *

    + *
+ * 1. The predicate rejects the candidate reference if it is referring to the node that is being deleted.
+ * 2. The predicate always accepts a direct neighbor of the node that is about to be deleted.
+ * 3. Sample the remaining potential candidates such that eventually the repair algorithm can use
+ *    roughly {@code efRepair} actual candidates (see the short sketch following this list).
+ *
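To make the sampling in item 3 concrete, here is a small, self-contained editor's sketch with made-up numbers (the efRepair value, the neighbor count and the candidate count are illustrative, not taken from this patch): the acceptance probability of a secondary candidate is chosen so that, together with the always-accepted direct neighbors, roughly efRepair candidates survive.

    import java.util.SplittableRandom;

    public final class SecondaryCandidateSamplingSketch {
        public static void main(final String[] args) {
            final SplittableRandom random = new SplittableRandom(42L);
            final int efRepair = 64;             // corresponds to Config#getEfRepair(); illustrative value
            final int numPrimaryNeighbors = 10;  // direct neighbors of the deleted node; always accepted
            final int numberOfCandidates = 300;  // potential secondary candidates seen by the predicate

            // aim for roughly efRepair accepted candidates overall, so subtract the primaries
            final double sampleRate = (double)(efRepair - numPrimaryNeighbors) / numberOfCandidates; // = 0.18
            final boolean accept = sampleRate >= 1.0 || random.nextDouble() < sampleRate;
            System.out.println("sampleRate=" + sampleRate + ", accept=" + accept);
        }
    }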
+ * @param random the PRNG to be used (splittable) + * @param initialNodeKeys a set of {@link Tuple}s that hold the primary neighbors of the node being deleted. + * @param numberOfCandidates the number of potential candidates the repair algorithm compiled + * @param candidateReference a potential candidate that is either accepted or rejected + * @param toBeDeletedPrimaryKey the {@link Tuple} representing the node that is being deleted + * @return {@code true} iff {@code candidateReference} is accepted as an actual candidate for repair. + */ + private boolean shouldUseSecondaryCandidateForRepair(@Nullable final SplittableRandom random, + @Nonnull final Set initialNodeKeys, + final int numberOfCandidates, + @Nonnull final NodeReference candidateReference, + @Nonnull final Tuple toBeDeletedPrimaryKey) { final Tuple candidatePrimaryKey = candidateReference.getPrimaryKey(); // @@ -2195,8 +2292,11 @@ private boolean useSecondaryCandidateForRepair(@Nullable final SplittableRandom return true; } - // sample all the rest - final double sampleRate = (double)getConfig().getM() / numberOfCandidates; + // + // Sample all the rest -- For the sampling rate, subtract the size of initialNodeKeys so that we get roughly + // efRepair nodes. + // + final double sampleRate = (double)(getConfig().getEfRepair() - initialNodeKeys.size()) / numberOfCandidates; if (sampleRate >= 1) { return true; } diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java index d84c9968b1..5c0c36395a 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InliningStorageAdapter.java @@ -91,6 +91,9 @@ public Transformed getVector(@Nonnull final NodeReferenceWithVector * It then performs an asynchronous range scan to retrieve all key-value pairs associated with that prefix. * Finally, it reconstructs the complete {@link AbstractNode} object from the collected raw data using * the {@code nodeFromRaw} method. + *

+ * Note that when using the inlining storage adapter, it is not possible to distinguish between a node that has no + * neighbors and a node that is not present in the database (i.e. it was deleted). * * @param readTransaction the transaction to use for reading from the database * @param storageTransform an affine transformation operator that is used to transform the fetched vector into the diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java index b6bfe0d093..0616446bf5 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/InsertNeighborsChangeSet.java @@ -85,6 +85,11 @@ public NeighborsChangeSet getParent() { return parent; } + @Override + public boolean hasChanges() { + return true; + } + /** * Merges the neighbors from this level of the hierarchy with all neighbors from parent levels. *

diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NeighborsChangeSet.java index 207c6a1f1f..98a8e92b9e 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NeighborsChangeSet.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NeighborsChangeSet.java @@ -51,6 +51,12 @@ interface NeighborsChangeSet { @Nullable NeighborsChangeSet getParent(); + /** + * Method to indicate iff changes have been made that need to be persisted. + * @return {@code true} iff changes have been made in this or parent change sets. + */ + boolean hasChanges(); + /** * Merges multiple internal sequences into a single, consolidated iterable sequence. *
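The hasChanges() method introduced above lets the delete path skip rewriting repaired candidates whose change set carries no modifications. Only the insert variant (returning true) appears in this patch; the following is a minimal editor's sketch, assuming the base and delete variants answer as shown, with hypothetical class names.

    // Assumption for illustration only: the base set reports no changes, a deletion always does.
    final class BaseNeighborsChangeSetSketch {
        public boolean hasChanges() {
            return false; // nothing is layered on top of the neighbors as read from storage
        }
    }

    final class DeleteNeighborsChangeSetSketch {
        public boolean hasChanges() {
            return true; // a removed neighbor must be written back
        }
    }

With that split, the guard in deleteFromLayers() shown earlier only persists nodes whose repair actually modified the neighbor set.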

diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java index f64759b36e..4e182c520a 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java @@ -31,7 +31,9 @@ * This is often used during graph traversal or searching, where a reference to a node (along with its distance from a * query point) is first identified, and then the complete node data is fetched. This class holds these two related * pieces of information together. - * @param the type of {@link NodeReference} used within the {@link AbstractNode} + * @param the type of {@link NodeReference} referencing the node + * @param the type of {@link NodeReference} used within the {@link AbstractNode}, i.e. the type of the neighbor + * references */ class NodeReferenceAndNode { @Nonnull diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java index 3ec3b5a75e..4f2f434a0c 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/OnWriteListener.java @@ -33,7 +33,7 @@ public interface OnWriteListener { }; /** - * Callback method invoked after a node has been successfully written to a specific layer. + * Callback method that is invoked after a node has been successfully written to a specific layer. *

* This is a default method with an empty implementation, allowing implementing classes to override it only if they * need to react to this event. @@ -45,19 +45,6 @@ default void onNodeWritten(final int layer, @Nonnull final Node - * This is a default method with an empty implementation, allowing implementing classes to override it only if they - * need to react to this event. - * @param layer the index of the layer where the node was deleted. - * @param primaryKey the {@link Tuple} used as key to identify the node that was deleted; guaranteed to be non-null. - */ - @SuppressWarnings("unused") - default void onNodeDeleted(final int layer, @Nonnull final Tuple primaryKey) { - // nothing - } - /** * Callback method invoked when a neighbor is written for a specific node. *

@@ -77,6 +64,34 @@ default void onNeighborWritten(final int layer, @Nonnull final Node + * This is a default method and its base implementation is a no-op. Implementors of the interface can override this + * method to react to the written key-value pair, for example, to clean up related resources or update internal + * state. + * @param layer the layer the data was written to + * @param key the key + * @param value the value + */ + @SuppressWarnings("unused") + default void onKeyValueWritten(final int layer, @Nonnull final byte[] key, @Nonnull final byte[] value) { + // nothing + } + + /** + * Callback method invoked after a node has been successfully deleted from a specific layer. + *

+ * This is a default method with an empty implementation, allowing implementing classes to override it only if they + * need to react to this event. + * @param layer the index of the layer where the node was deleted. + * @param primaryKey the {@link Tuple} used as key to identify the node that was deleted; guaranteed to be non-null. + */ + @SuppressWarnings("unused") + default void onNodeDeleted(final int layer, @Nonnull final Tuple primaryKey) { + // nothing + } + /** * Callback method invoked when a neighbor of a specific node is deleted. *

@@ -93,16 +108,29 @@ default void onNeighborDeleted(final int layer, @Nonnull final Node + * This is a default method and its base implementation is a no-op. Implementors of the interface can override this + * method to react to the deletion of a key, for example, to clean up related resources or update internal + * state. + * @param layer the layer index where the deletion occurred + * @param key the key that was deleted + */ @SuppressWarnings("unused") default void onKeyDeleted(final int layer, @Nonnull final byte[] key) { // nothing } + /** + * Callback method invoked when an entire range is deleted. + *
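The deletion callbacks described here (onNodeDeleted, onKeyDeleted and onRangeDeleted) are what the tests later in this patch hook into to count deletions per layer. A minimal editor's sketch of such a listener follows; the class name is hypothetical and it stands in for, rather than reproduces, the project's TestOnWriteListener.

    import com.apple.foundationdb.tuple.Tuple;

    import javax.annotation.Nonnull;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // assumed to live in (or import) com.apple.foundationdb.async.hnsw so that OnWriteListener resolves
    final class DeletionCountingWriteListener implements OnWriteListener {
        private final Map<Integer, Integer> deleteCountByLayer = new ConcurrentHashMap<>();

        @Override
        public void onNodeDeleted(final int layer, @Nonnull final Tuple primaryKey) {
            // count one node deletion for the layer it was removed from
            deleteCountByLayer.merge(layer, 1, Integer::sum);
        }

        @Nonnull
        public Map<Integer, Integer> getDeleteCountByLayer() {
            return deleteCountByLayer;
        }
    }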

+ * This is a default method and its base implementation is a no-op. Implementors of the interface can override this + * method to react to the deletion of a neighbor node, for example, to clean up related resources or update internal + * state. + * @param layer the layer index where the deletion occurred + * @param range the {@link Range} that was deleted + */ @SuppressWarnings("unused") default void onRangeDeleted(final int layer, @Nonnull final Range range) { // nothing diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java index f8b8d719fe..b84f635356 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java @@ -93,13 +93,37 @@ interface StorageAdapter { @Nonnull NodeFactory getNodeFactory(); + /** + * Method that returns {@code true} iff this {@link StorageAdapter} is inlining neighboring vectors (i.e. it is + * an {@link InliningStorageAdapter}). + * @return {@code true} iff this {@link StorageAdapter} is inlining neighboring vectors. + */ boolean isInliningStorageAdapter(); + /** + * Method that returns {@code this} object as an {@link InliningStorageAdapter} if this {@link StorageAdapter} is + * inlining neighboring vectors and is an {@link InliningStorageAdapter}. This method throws an exception if this + * storage adapter is any other kind of storage adapter. Callers of this method should ensure prior to calling this + * method that the storage adapter actually is of the right kind (by calling{@link #isInliningStorageAdapter()}. + * @return {@code this} as an {@link InliningStorageAdapter} + */ @Nonnull InliningStorageAdapter asInliningStorageAdapter(); + /** + * Method that returns {@code true} iff this {@link StorageAdapter} is a compact storage adapter which means it is + * not inlining neighboring vectors (i.e. {@code this} is a {@link CompactStorageAdapter}). + * @return {@code true} iff this {@link StorageAdapter} is a {@link CompactStorageAdapter}. + */ boolean isCompactStorageAdapter(); + /** + * Method that returns {@code this} object a {@link CompactStorageAdapter} if this {@link StorageAdapter} is + * a {@link CompactStorageAdapter}. This method throws an exception if this storage adapter is any other kind of + * storage adapter. Callers of this method should ensure prior to calling this method that the storage adapter + * actually is of the right kind (by calling{@link #isCompactStorageAdapter()}. + * @return {@code this} as a {@link CompactStorageAdapter} + */ @Nonnull CompactStorageAdapter asCompactStorageAdapter(); @@ -134,6 +158,16 @@ interface StorageAdapter { @Nonnull OnReadListener getOnReadListener(); + /** + * Method that returns the vector associated with node information passed in. Note that depending on the storage + * layout and therefore the used {@link StorageAdapter}, the vector is either part of the reference + * (when using {@link InliningStorageAdapter}) or is s part of the {@link AbstractNode} itself (when using + * {@link CompactStorageAdapter}). This method hides that detail from the caller and correctly resolves the vector + * for bot use cases. 
+ * @param nodeReference a node reference + * @param node the accompanying node to {@code nodeReference} + * @return the associated vector as {@link Transformed} of {@link RealVector} + */ @Nonnull Transformed getVector(@Nonnull N nodeReference, @Nonnull AbstractNode node); @@ -175,9 +209,9 @@ void writeNode(@Nonnull Transaction transaction, @Nonnull Quantizer quantizer, i @Nonnull AbstractNode node, @Nonnull NeighborsChangeSet changeSet); /** - * Deletes a node from the database. + * Deletes a node from a particular layer in the database. * @param transaction the transaction to use - * @param layer the layer the node should be removed from + * @param layer the layer the node should be deleted from * @param primaryKey the primary key of the node */ void deleteNode(@Nonnull Transaction transaction, int layer, @Nonnull Tuple primaryKey); diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java index f6319dae14..bf354e9459 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/ConfigTest.java @@ -38,6 +38,7 @@ void testConfig() { final int mMax = Config.DEFAULT_M_MAX + 1; final int mMax0 = Config.DEFAULT_M_MAX_0 + 1; final int efConstruction = Config.DEFAULT_EF_CONSTRUCTION + 1; + final int efRepair = Config.DEFAULT_EF_REPAIR + 1; final boolean extendCandidates = true; final boolean keepPrunedConnections = true; final int statsThreshold = 5000; @@ -49,6 +50,7 @@ void testConfig() { final int maxNumConcurrentNodeFetches = 1; final int maxNumConcurrentNeighborhoodFetches = 2; + final int maxNumConcurrentDeleteFromLayer = Config.DEFAULT_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER + 1; Assertions.assertThat(defaultConfig.getMetric()).isNotSameAs(metric); Assertions.assertThat(defaultConfig.isUseInlining()).isNotEqualTo(useInlining); @@ -56,6 +58,7 @@ void testConfig() { Assertions.assertThat(defaultConfig.getMMax()).isNotEqualTo(mMax); Assertions.assertThat(defaultConfig.getMMax0()).isNotEqualTo(mMax0); Assertions.assertThat(defaultConfig.getEfConstruction()).isNotEqualTo(efConstruction); + Assertions.assertThat(defaultConfig.getEfRepair()).isNotEqualTo(efRepair); Assertions.assertThat(defaultConfig.isExtendCandidates()).isNotEqualTo(extendCandidates); Assertions.assertThat(defaultConfig.isKeepPrunedConnections()).isNotEqualTo(keepPrunedConnections); @@ -68,6 +71,7 @@ void testConfig() { Assertions.assertThat(defaultConfig.getMaxNumConcurrentNodeFetches()).isNotEqualTo(maxNumConcurrentNodeFetches); Assertions.assertThat(defaultConfig.getMaxNumConcurrentNeighborhoodFetches()).isNotEqualTo(maxNumConcurrentNeighborhoodFetches); + Assertions.assertThat(defaultConfig.getMaxNumConcurrentDeleteFromLayer()).isNotEqualTo(maxNumConcurrentDeleteFromLayer); final Config newConfig = defaultConfig.toBuilder() @@ -77,6 +81,7 @@ void testConfig() { .setMMax(mMax) .setMMax0(mMax0) .setEfConstruction(efConstruction) + .setEfRepair(efRepair) .setExtendCandidates(extendCandidates) .setKeepPrunedConnections(keepPrunedConnections) .setSampleVectorStatsProbability(sampleVectorStatsProbability) @@ -86,6 +91,7 @@ void testConfig() { .setRaBitQNumExBits(raBitQNumExBits) .setMaxNumConcurrentNodeFetches(maxNumConcurrentNodeFetches) .setMaxNumConcurrentNeighborhoodFetches(maxNumConcurrentNeighborhoodFetches) + .setMaxNumConcurrentDeleteFromLayer(maxNumConcurrentDeleteFromLayer) .build(768); 
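        // An editor's sketch, not asserted by this test: the new repair-related knobs round-trip
        // through the builder in the same way as the options above; the values are illustrative only.
        final Config repairTunedConfig = defaultConfig.toBuilder()
                .setEfRepair(64)
                .setMaxNumConcurrentDeleteFromLayer(5)
                .build(768);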
Assertions.assertThat(newConfig.getMetric()).isSameAs(metric); @@ -94,6 +100,7 @@ void testConfig() { Assertions.assertThat(newConfig.getMMax()).isEqualTo(mMax); Assertions.assertThat(newConfig.getMMax0()).isEqualTo(mMax0); Assertions.assertThat(newConfig.getEfConstruction()).isEqualTo(efConstruction); + Assertions.assertThat(newConfig.getEfRepair()).isEqualTo(efRepair); Assertions.assertThat(newConfig.isExtendCandidates()).isEqualTo(extendCandidates); Assertions.assertThat(newConfig.isKeepPrunedConnections()).isEqualTo(keepPrunedConnections); @@ -106,6 +113,7 @@ void testConfig() { Assertions.assertThat(newConfig.getMaxNumConcurrentNodeFetches()).isEqualTo(maxNumConcurrentNodeFetches); Assertions.assertThat(newConfig.getMaxNumConcurrentNeighborhoodFetches()).isEqualTo(maxNumConcurrentNeighborhoodFetches); + Assertions.assertThat(newConfig.getMaxNumConcurrentDeleteFromLayer()).isEqualTo(maxNumConcurrentDeleteFromLayer); } @Test diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/DataRecordsTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/DataRecordsTest.java index 6d316103c8..69c4ade7ac 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/DataRecordsTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/DataRecordsTest.java @@ -124,6 +124,25 @@ void testResultEntry(final long randomSeed) { assertHashCodeEqualsToString(randomSeed, DataRecordsTest::resultEntry, DataRecordsTest::resultEntry); } + @ParameterizedTest + @RandomSeedSource({0x0fdbL, 0x5ca1eL, 123456L, 78910L, 1123581321345589L}) + void testNodeReferenceAndNode(final long randomSeed) { + assertToString(randomSeed, DataRecordsTest::nodeReferenceAndNode, DataRecordsTest::nodeReferenceAndNode); + } + + private static void assertToString(final long randomSeed, + @Nonnull final Function createFunction, + @Nonnull final BiFunction createDifferentFunction) { + final Random random = new Random(randomSeed); + final long dependentRandomSeed = random.nextLong(); + final T t1 = createFunction.apply(new Random(dependentRandomSeed)); + final T t1Clone = createFunction.apply(new Random(dependentRandomSeed)); + Assertions.assertThat(t1).hasToString(t1Clone.toString()); + + final T t2 = createDifferentFunction.apply(random, t1); + Assertions.assertThat(t1).doesNotHaveToString(t2.toString()); + } + private static void assertHashCodeEqualsToString(final long randomSeed, @Nonnull final Function createFunction, @Nonnull final BiFunction createDifferentFunction) { @@ -140,6 +159,20 @@ private static void assertHashCodeEqualsToString(final long randomSeed, Assertions.assertThat(t1).doesNotHaveToString(t2.toString()); } + @Nonnull + private static NodeReferenceAndNode + nodeReferenceAndNode(@Nonnull final Random random) { + return new NodeReferenceAndNode<>(nodeReferenceWithDistance(random), inliningNode(random)); + } + + @Nonnull + private static NodeReferenceAndNode + nodeReferenceAndNode(@Nonnull final Random random, + @Nonnull final NodeReferenceAndNode original) { + return new NodeReferenceAndNode<>(nodeReferenceWithDistance(random, original.getNodeReference()), + inliningNode(random, original.getNode().asInliningNode())); + } + @Nonnull private static ResultEntry resultEntry(@Nonnull final Random random) { return new ResultEntry(primaryKey(random), rawVector(random), random.nextDouble(), random.nextInt(100)); diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java 
b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index e236731685..0d29f9484c 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -41,13 +41,13 @@ import com.apple.test.SuperSlow; import com.apple.test.Tags; import com.google.common.base.Verify; +import com.google.common.base.VerifyException; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.ObjectArrays; import com.google.common.collect.Sets; -import org.assertj.core.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; @@ -99,6 +99,8 @@ import static com.apple.foundationdb.linear.RealVectorTest.createRandomDoubleVector; import static com.apple.foundationdb.linear.RealVectorTest.createRandomHalfVector; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assertions.within; /** @@ -130,12 +132,15 @@ public void setUpDb() { @ParameterizedTest @RandomSeedSource({0x0fdbL, 0x5ca1eL, 123456L, 78910L, 1123581321345589L}) - void testCompactSerialization(final long seed) { + void testCompactSerialization(final long seed) throws Exception { final Random random = new Random(seed); final int numDimensions = 768; final CompactStorageAdapter storageAdapter = new CompactStorageAdapter(HNSW.newConfigBuilder().build(numDimensions), CompactNode.factory(), rtSubspace.getSubspace(), OnWriteListener.NOOP, OnReadListener.NOOP); + assertThat(storageAdapter.asCompactStorageAdapter()).isSameAs(storageAdapter); + assertThatThrownBy(storageAdapter::asInliningStorageAdapter).isInstanceOf(VerifyException.class); + final AbstractNode originalNode = db.run(tr -> { final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); @@ -150,11 +155,11 @@ void testCompactSerialization(final long seed) { db.run(tr -> storageAdapter.fetchNode(tr, AffineOperator.identity(), 0, originalNode.getPrimaryKey()) .thenAccept(node -> - Assertions.assertThat(node).satisfies( - n -> Assertions.assertThat(n).isInstanceOf(CompactNode.class), - n -> Assertions.assertThat(n.getKind()).isSameAs(NodeKind.COMPACT), - n -> Assertions.assertThat((Object)n.getPrimaryKey()).isEqualTo(originalNode.getPrimaryKey()), - n -> Assertions.assertThat(n.asCompactNode().getVector()) + assertThat(node).satisfies( + n -> assertThat(n).isInstanceOf(CompactNode.class), + n -> assertThat(n.getKind()).isSameAs(NodeKind.COMPACT), + n -> assertThat((Object)n.getPrimaryKey()).isEqualTo(originalNode.getPrimaryKey()), + n -> assertThat(n.asCompactNode().getVector()) .isEqualTo(originalNode.asCompactNode().getVector()), n -> { final ArrayList neighbors = @@ -163,20 +168,28 @@ void testCompactSerialization(final long seed) { final ArrayList originalNeighbors = Lists.newArrayList(originalNode.getNeighbors()); originalNeighbors.sort(Comparator.comparing(NodeReference::getPrimaryKey)); - Assertions.assertThat(neighbors).isEqualTo(originalNeighbors); + assertThat(neighbors).isEqualTo(originalNeighbors); } )).join()); + + assertThat( + dumpLayer(HNSW.newConfigBuilder() + .build(numDimensions), "debug", 0)) + .isGreaterThan(0); } @ParameterizedTest @RandomSeedSource({0x0fdbL, 0x5ca1eL, 123456L, 78910L, 1123581321345589L}) - void 
testInliningSerialization(final long seed) { + void testInliningSerialization(final long seed) throws Exception { final Random random = new Random(seed); final int numDimensions = 768; final InliningStorageAdapter storageAdapter = new InliningStorageAdapter(HNSW.newConfigBuilder().build(numDimensions), InliningNode.factory(), rtSubspace.getSubspace(), OnWriteListener.NOOP, OnReadListener.NOOP); + assertThat(storageAdapter.asInliningStorageAdapter()).isSameAs(storageAdapter); + assertThatThrownBy(storageAdapter::asCompactStorageAdapter).isInstanceOf(VerifyException.class); + final Node originalNode = db.run(tr -> { final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); @@ -184,17 +197,17 @@ void testInliningSerialization(final long seed) { final AbstractNode randomInliningNode = createRandomInliningNode(random, nodeFactory, numDimensions, 16); - writeNode(tr, storageAdapter, randomInliningNode, 0); + writeNode(tr, storageAdapter, randomInliningNode, 1); return randomInliningNode; }); - db.run(tr -> storageAdapter.fetchNode(tr, AffineOperator.identity(), 0, + db.run(tr -> storageAdapter.fetchNode(tr, AffineOperator.identity(), 1, originalNode.getPrimaryKey()) .thenAccept(node -> - Assertions.assertThat(node).satisfies( - n -> Assertions.assertThat(n).isInstanceOf(InliningNode.class), - n -> Assertions.assertThat(n.getKind()).isSameAs(NodeKind.INLINING), - n -> Assertions.assertThat((Object)node.getPrimaryKey()).isEqualTo(originalNode.getPrimaryKey()), + assertThat(node).satisfies( + n -> assertThat(n).isInstanceOf(InliningNode.class), + n -> assertThat(n.getKind()).isSameAs(NodeKind.INLINING), + n -> assertThat((Object)node.getPrimaryKey()).isEqualTo(originalNode.getPrimaryKey()), n -> { final ArrayList neighbors = Lists.newArrayList(node.getNeighbors()); @@ -202,14 +215,20 @@ void testInliningSerialization(final long seed) { final ArrayList originalNeighbors = Lists.newArrayList(originalNode.getNeighbors()); originalNeighbors.sort(Comparator.comparing(NodeReference::getPrimaryKey)); - Assertions.assertThat(neighbors).isEqualTo(originalNeighbors); + assertThat(neighbors).isEqualTo(originalNeighbors); } )).join()); + + assertThat( + dumpLayer(HNSW.newConfigBuilder() + .setUseInlining(true) + .build(numDimensions), "debug", 1)) + .isGreaterThan(0); } static Stream randomSeedsWithConfig() { return RandomizedTestUtils.randomSeeds(0xdeadc0deL) - .flatMap(seed -> Sets.cartesianProduct(ImmutableSet.of(true, false), + .flatMap(seed -> Sets.cartesianProduct(ImmutableSet.of(false, true), ImmutableSet.of(false, true), ImmutableSet.of(false, true), ImmutableSet.of(false, true)).stream() @@ -217,6 +236,7 @@ static Stream randomSeedsWithConfig() { new Object[] {HNSW.newConfigBuilder() .setMetric(Metric.EUCLIDEAN_METRIC) .setUseInlining(arguments.get(0)) + .setEfRepair(64) .setExtendCandidates(arguments.get(1)) .setKeepPrunedConnections(arguments.get(2)) .setUseRaBitQ(arguments.get(3)) @@ -230,38 +250,29 @@ static Stream randomSeedsWithConfig() { .build(128)})))); } + @ExtendWith(HNSWTest.DumpLayersIfFailure.class) @ParameterizedTest @MethodSource("randomSeedsWithConfig") void testBasicInsert(final long seed, final Config config) { final Random random = new Random(seed); final Metric metric = config.getMetric(); + final int size = 1000; + final TestOnWriteListener onWriteListener = new TestOnWriteListener(); final TestOnReadListener onReadListener = new TestOnReadListener(); - final HNSW hnsw = - new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), config, - OnWriteListener.NOOP, 
onReadListener); + final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), config, + onWriteListener, onReadListener); final int k = 50; - final HalfRealVector queryVector = createRandomHalfVector(random, config.getNumDimensions()); - final TreeSet recordsOrderedByDistance = - new TreeSet<>(Comparator.comparing(PrimaryKeyVectorAndDistance::getDistance)); + final List insertedData = randomVectors(random, config.getNumDimensions(), size); - for (int i = 0; i < 1000; ) { + for (int i = 0; i < size;) { i += basicInsertBatch(hnsw, 100, i, onReadListener, - (tr, nextId) -> { - final var primaryKey = createPrimaryKey(nextId); - final HalfRealVector dataVector = createRandomHalfVector(random, config.getNumDimensions()); - final double distance = metric.distance(dataVector, queryVector); - final PrimaryKeyVectorAndDistance record = - new PrimaryKeyVectorAndDistance(primaryKey, dataVector, distance); - recordsOrderedByDistance.add(record); - if (recordsOrderedByDistance.size() > k) { - recordsOrderedByDistance.pollLast(); - } - return record; - }); + (tr, nextId) -> insertedData.get(Math.toIntExact(nextId))); } + final HalfRealVector queryVector = createRandomHalfVector(random, config.getNumDimensions()); + // // Attempt to mutate some records by updating them using the same primary keys but different random vectors. // This should not fail but should be silently ignored. If this succeeds, the following searches will all @@ -285,7 +296,7 @@ void testBasicInsert(final long seed, final Config config) { final long endTs = System.nanoTime(); final ImmutableSet trueNN = - recordsOrderedByDistance.stream() + orderedByDistances(Metric.EUCLIDEAN_METRIC, insertedData, queryVector).stream() .limit(k) .map(PrimaryKeyVectorAndDistance::getPrimaryKey) .collect(ImmutableSet.toImmutableSet()); @@ -303,7 +314,7 @@ void testBasicInsert(final long seed, final Config config) { TimeUnit.NANOSECONDS.toMillis(endTs - beginTs), onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), String.format(Locale.ROOT, "%.2f", recall * 100.0d)); - Assertions.assertThat(recall).isGreaterThan(0.9); + assertThat(recall).isGreaterThan(0.9); final Set insertedIds = LongStream.range(0, 1000) @@ -312,13 +323,15 @@ void testBasicInsert(final long seed, final Config config) { final Set readIds = Sets.newHashSet(); scanLayer(config, 0, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - Assertions.assertThat(readIds).isEqualTo(insertedIds); + node -> + assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + assertThat(readIds).isEqualTo(insertedIds); readIds.clear(); scanLayer(config, 1, 100, - node -> Assertions.assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); - Assertions.assertThat(readIds.size()).isBetween(10, 100); + node -> + assertThat(readIds.add(node.getPrimaryKey().getLong(0))).isTrue()); + assertThat(readIds.size()).isBetween(10, 100); } @ExtendWith(HNSWTest.DumpLayersIfFailure.class) @@ -334,7 +347,7 @@ void testBasicInsertDelete(final long seed, final Config config) { onWriteListener, onReadListener); final int k = 50; - final List insertedData = randomVectors(random, config.getNumDimensions(), 1000); + final List insertedData = randomVectors(random, config.getNumDimensions(), size); for (int i = 0; i < size;) { i += basicInsertBatch(hnsw, 100, i, onReadListener, @@ -359,7 +372,7 @@ void testBasicInsertDelete(final long seed, final Config config) { }); long endTs = System.nanoTime(); - 
Assertions.assertThat(onWriteListener.getDeleteCountByLayer().get(0)).isEqualTo(toBeDeleted.size()); + assertThat(onWriteListener.getDeleteCountByLayer().get(0)).isEqualTo(toBeDeleted.size()); logger.info("delete transaction of {} records after {} records took elapsedTime={}ms; read nodes={}, read bytes={}", numVectorsPerDeleteBatch, @@ -409,17 +422,17 @@ void testBasicInsertDelete(final long seed, final Config config) { onReadListener.getNodeCountByLayer(), onReadListener.getBytesReadByLayer(), String.format(Locale.ROOT, "%.2f", recall * 100.0d)); - Assertions.assertThat(recall).isGreaterThan(0.9); + assertThat(recall).isGreaterThan(0.9); final long remainingNumNodes = countNodesOnLayer(config, 0); - Assertions.assertThat(remainingNumNodes).isEqualTo(remainingData.size()); + assertThat(remainingNumNodes).isEqualTo(remainingData.size()); } } while (!remainingData.isEmpty()); final var accessInfo = db.run(transaction -> StorageAdapter.fetchAccessInfo(hnsw.getConfig(), transaction, hnsw.getSubspace(), OnReadListener.NOOP).join()); - Assertions.assertThat(accessInfo).isNull(); + assertThat(accessInfo).isNull(); } @ParameterizedTest() @@ -499,27 +512,27 @@ void testBasicInsertWithRaBitQEncodings(final long seed) { } final RealVector originalVector = dataMap.get(resultEntry.getPrimaryKey()); - Assertions.assertThat(originalVector).isNotNull(); + assertThat(originalVector).isNotNull(); final RealVector fromDBVector = fromDBMap.get(resultEntry.getPrimaryKey()); - Assertions.assertThat(fromDBVector).isNotNull(); + assertThat(fromDBVector).isNotNull(); if (!(fromDBVector instanceof EncodedRealVector)) { - Assertions.assertThat(originalVector).isEqualTo(fromDBVector); + assertThat(originalVector).isEqualTo(fromDBVector); exactVectorCount ++; final double distance = metric.distance(originalVector, Objects.requireNonNull(resultEntry.getVector())); - Assertions.assertThat(distance).isCloseTo(0.0d, within(2E-12)); + assertThat(distance).isCloseTo(0.0d, within(2E-12)); } else { encodedVectorCount ++; final double distance = metric.distance(originalVector, Objects.requireNonNull(resultEntry.getVector()).toDoubleRealVector()); - Assertions.assertThat(distance).isCloseTo(0.0d, within(20.0d)); + assertThat(distance).isCloseTo(0.0d, within(20.0d)); } } final double recall = (double)recallCount / (double)k; - Assertions.assertThat(recall).isGreaterThan(0.9); + assertThat(recall).isGreaterThan(0.9); // must have both kinds - Assertions.assertThat(exactVectorCount).isGreaterThan(0); - Assertions.assertThat(encodedVectorCount).isGreaterThan(0); + assertThat(exactVectorCount).isGreaterThan(0); + assertThat(encodedVectorCount).isGreaterThan(0); } private int basicInsertBatch(final HNSW hnsw, final int batchSize, @@ -553,7 +566,7 @@ void testSIFTInsertSmall() throws Exception { final HNSW hnsw = new HNSW(rtSubspace.getSubspace(), TestExecutors.defaultThreadPool(), HNSW.newConfigBuilder() .setUseRaBitQ(true) - .setRaBitQNumExBits(5) + .setRaBitQNumExBits(6) .setMetric(metric) .setM(32) .setMMax(32) @@ -590,7 +603,7 @@ void testSIFTInsertSmall() throws Exception { return new PrimaryKeyAndVector(currentPrimaryKey, currentVector); }); } - Assertions.assertThat(i).isEqualTo(10000); + assertThat(i).isEqualTo(10000); } validateSIFTSmall(hnsw, dataMap, k); @@ -630,15 +643,15 @@ private void validateSIFTSmall(@Nonnull final HNSW hnsw, @Nonnull final Map void writeNode(@Nonnull final Transaction transaction, @@ -844,7 +857,7 @@ private void dumpLayers(@Nonnull final HNSWTest hnswTest, @Nonnull final Config int layer = 0; 
while (true) { try { - if (!hnswTest.dumpLayer(config, "debug", layer++)) { + if (hnswTest.dumpLayer(config, "debug", layer++) == 0) { break; } } catch (IOException e) { diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java index 950f440f96..fe8a5de5ca 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java @@ -282,6 +282,15 @@ public class IndexOptions { */ public static final String HNSW_EF_CONSTRUCTION = "hnswEfConstruction"; + /** + * HNSW-only: Maximum number of candidate nodes that are considered when an HNSW layer is locally repaired as part of + * a delete operation. A smaller number causes the delete operation to create a smaller set of candidate nodes, + * which improves repair performance but may decrease repair quality; a higher number results in qualitatively + * better repairs at the expense of slower performance. + * The default value is set to {@link Config#DEFAULT_EF_REPAIR}. See {@link Config#getEfRepair()}. + */ + public static final String HNSW_EF_REPAIR = "hnswEfRepair"; + /** + * HNSW-only: Indicator to signal if, during the insertion of a node, the set of nearest neighbors of that node is + * to be extended by the actual neighbors of those neighbors to form a set of candidates that the new node may be @@ -354,6 +363,13 @@ public class IndexOptions { */ public static final String HNSW_MAX_NUM_CONCURRENT_NEIGHBORHOOD_FETCHES = "hnswMaxNumConcurrentNeighborhoodFetches"; + /** + * HNSW-only: Maximum number of delete operations that can run concurrently in separate layers during the deletion + * of a record. The default value is set to {@link Config#DEFAULT_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER}. + * See {@link Config#getMaxNumConcurrentDeleteFromLayer()}. 
+ */ + public static final String HNSW_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER = "hnswMaxNumConcurrentDeleteFromLayer"; + private IndexOptions() { } } diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java index 845acb9e9f..6e2c7abf8e 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexHelper.java @@ -80,6 +80,10 @@ public static Config getConfig(@Nonnull final Index index) { if (hnswEfConstructionOption != null) { builder.setEfConstruction(Integer.parseInt(hnswEfConstructionOption)); } + final String hnswEfRepairOption = index.getOption(IndexOptions.HNSW_EF_REPAIR); + if (hnswEfRepairOption != null) { + builder.setEfRepair(Integer.parseInt(hnswEfRepairOption)); + } final String hnswExtendCandidatesOption = index.getOption(IndexOptions.HNSW_EXTEND_CANDIDATES); if (hnswExtendCandidatesOption != null) { builder.setExtendCandidates(Boolean.parseBoolean(hnswExtendCandidatesOption)); @@ -116,6 +120,10 @@ public static Config getConfig(@Nonnull final Index index) { if (hnswMaxNumConcurrentNeighborhoodFetchesOption != null) { builder.setMaxNumConcurrentNeighborhoodFetches(Integer.parseInt(hnswMaxNumConcurrentNeighborhoodFetchesOption)); } + final String hnswMaxNumConcurrentDeleteFromLayerOption = index.getOption(IndexOptions.HNSW_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER); + if (hnswMaxNumConcurrentDeleteFromLayerOption != null) { + builder.setMaxNumConcurrentDeleteFromLayer(Integer.parseInt(hnswMaxNumConcurrentDeleteFromLayerOption)); + } return builder.build(numDimensions); } diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java index c5f66c381d..2a83fb355a 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexMaintainerFactory.java @@ -151,6 +151,8 @@ public void validateChangedOptions(@Nonnull final Index oldIndex, oldOptions, newOptions, Config::getMMax0); disallowChange(changedOptions, IndexOptions.HNSW_EF_CONSTRUCTION, oldOptions, newOptions, Config::getEfConstruction); + disallowChange(changedOptions, IndexOptions.HNSW_EF_REPAIR, + oldOptions, newOptions, Config::getEfRepair); disallowChange(changedOptions, IndexOptions.HNSW_EXTEND_CANDIDATES, oldOptions, newOptions, Config::isExtendCandidates); disallowChange(changedOptions, IndexOptions.HNSW_KEEP_PRUNED_CONNECTIONS, @@ -166,6 +168,7 @@ public void validateChangedOptions(@Nonnull final Index oldIndex, changedOptions.remove(IndexOptions.HNSW_STATS_THRESHOLD); changedOptions.remove(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NODE_FETCHES); changedOptions.remove(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NEIGHBORHOOD_FETCHES); + changedOptions.remove(IndexOptions.HNSW_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER); } super.validateChangedOptions(oldIndex, changedOptions); } diff --git 
a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java index d75bd8d0c1..fa6bdfeb86 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java @@ -311,6 +311,7 @@ void directIndexValidatorTest() throws Exception { .put(IndexOptions.HNSW_M_MAX, "16") .put(IndexOptions.HNSW_M_MAX_0, "32") .put(IndexOptions.HNSW_EF_CONSTRUCTION, "200") + .put(IndexOptions.HNSW_EF_REPAIR, "64") .put(IndexOptions.HNSW_EXTEND_CANDIDATES, "false") .put(IndexOptions.HNSW_KEEP_PRUNED_CONNECTIONS, "false") .put(IndexOptions.HNSW_USE_RABITQ, "false") @@ -321,7 +322,8 @@ void directIndexValidatorTest() throws Exception { .put(IndexOptions.HNSW_MAINTAIN_STATS_PROBABILITY, "0.78") .put(IndexOptions.HNSW_STATS_THRESHOLD, "500") .put(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NODE_FETCHES, "17") - .put(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NEIGHBORHOOD_FETCHES, "9").build()); + .put(IndexOptions.HNSW_MAX_NUM_CONCURRENT_NEIGHBORHOOD_FETCHES, "9") + .put(IndexOptions.HNSW_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER, "5").build()); Assertions.assertThatThrownBy(() -> validateIndexEvolution(metaDataValidator, index, ImmutableMap.of(IndexOptions.HNSW_NUM_DIMENSIONS, "128", @@ -352,6 +354,10 @@ void directIndexValidatorTest() throws Exception { ImmutableMap.of(IndexOptions.HNSW_NUM_DIMENSIONS, "128", IndexOptions.HNSW_EF_CONSTRUCTION, "500"))).isInstanceOf(MetaDataException.class); + Assertions.assertThatThrownBy(() -> validateIndexEvolution(metaDataValidator, index, + ImmutableMap.of(IndexOptions.HNSW_NUM_DIMENSIONS, "128", + IndexOptions.HNSW_EF_REPAIR, "500"))).isInstanceOf(MetaDataException.class); + Assertions.assertThatThrownBy(() -> validateIndexEvolution(metaDataValidator, index, ImmutableMap.of(IndexOptions.HNSW_NUM_DIMENSIONS, "128", IndexOptions.HNSW_EXTEND_CANDIDATES, "true"))).isInstanceOf(MetaDataException.class); From 17d98a2a9d46be4429f932204670ab333b9660f1 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Mon, 15 Dec 2025 14:12:03 +0100 Subject: [PATCH 12/17] added test using update/deletes --- .../foundationdb/indexes/VectorIndexTest.java | 183 +++++++++++++++--- .../indexes/VectorIndexTestBase.java | 53 +++-- .../recordrepair/ValidationTestUtils.java | 2 +- 3 files changed, 195 insertions(+), 43 deletions(-) diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java index fa6bdfeb86..be088b45f0 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java @@ -20,7 +20,6 @@ package com.apple.foundationdb.record.provider.foundationdb.indexes; -import com.apple.foundationdb.async.hnsw.NodeReference; import com.apple.foundationdb.linear.HalfRealVector; import com.apple.foundationdb.linear.Metric; import com.apple.foundationdb.record.Bindings; @@ -59,7 +58,9 @@ import com.apple.foundationdb.record.query.plan.plans.RecordQueryFetchFromPartialRecordPlan; import 
com.apple.foundationdb.record.query.plan.plans.RecordQueryIndexPlan; import com.apple.foundationdb.record.vector.TestRecordsVectorsProto.VectorRecord; +import com.apple.foundationdb.tuple.Tuple; import com.apple.test.RandomizedTestUtils; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.ObjectArrays; @@ -81,7 +82,6 @@ import java.util.Optional; import java.util.Random; import java.util.Set; -import java.util.stream.Collectors; import java.util.stream.Stream; import static com.apple.foundationdb.record.metadata.Key.Expressions.concat; @@ -118,7 +118,7 @@ static Stream randomSeedsWithAsyncAndLimit() { void basicWriteReadTest(final long seed, final boolean useAsync) throws Exception { final Random random = new Random(seed); final List> savedRecords = - saveRecords(useAsync, this::addVectorIndexes, random, 1000, 0.3); + saveRandomRecords(useAsync, this::addVectorIndexes, random, 1000, 0.3); try (final FDBRecordContext context = openContext()) { openRecordStore(context, this::addVectorIndexes); for (int l = 0; l < 1000; l ++) { @@ -140,7 +140,7 @@ void basicWriteIndexReadWithContinuationTest(final long seed, final boolean useA final HalfRealVector queryVector = randomHalfVector(random, 128); final List> savedRecords = - saveRecords(useAsync, this::addUngroupedVectorIndex, random, 1000); + saveRandomRecords(useAsync, this::addUngroupedVectorIndex, random, 1000); final Set expectedResults = sortByDistances(savedRecords, queryVector, Metric.EUCLIDEAN_METRIC).stream() @@ -149,7 +149,7 @@ void basicWriteIndexReadWithContinuationTest(final long seed, final boolean useA nodeReferenceWithDistance.getPrimaryKey().getLong(0)) .collect(ImmutableSet.toImmutableSet()); - final var indexPlan = + final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "UngroupedVectorIndex"); verifyRebase(indexPlan); @@ -196,13 +196,17 @@ void basicWriteIndexReadWithContinuationTest(final long seed, final boolean useA @ParameterizedTest @MethodSource("randomSeedsWithAsyncAndLimit") void basicWriteIndexReadGroupedWithContinuationTest(final long seed, final boolean useAsync, final int limit) throws Exception { + final int size = 1000; final int k = 100; final Random random = new Random(seed); final HalfRealVector queryVector = randomHalfVector(random, 128); - final Map> expectedResults = - saveRandomRecords(random, this::addGroupedVectorIndex, useAsync, 1000, queryVector); - final var indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); + final List> savedRecords = + saveRandomRecords(useAsync, this::addGroupedVectorIndex, random, size); + final Map> randomRecords = groupAndSortByDistances(savedRecords, queryVector); + final Map> expectedResults = trueTopK(randomRecords, k); + + final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); verifyRebase(indexPlan); verifySerialization(indexPlan); @@ -249,6 +253,133 @@ void basicWriteIndexReadGroupedWithContinuationTest(final long seed, final boole } } + @ParameterizedTest + @MethodSource("randomSeedsWithAsyncAndLimit") + void insertReadDeleteReadGroupedWithContinuationTest(final long seed, final boolean useAsync, final int limit) throws Exception { + final int size = 1000; + Assertions.assertThat(size % 2).isEqualTo(0); // needs to be even + final int updateBatchSize = 50; + Assertions.assertThat(size % updateBatchSize).isEqualTo(0); // needs to be divisible + + final int k = 100; + final Random 
random = new Random(seed); + final var savedRecords = saveRandomRecords(useAsync, this::addGroupedVectorIndex, random, size); + + final HalfRealVector queryVector = randomHalfVector(random, 128); + + final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); + verifyRebase(indexPlan); + verifySerialization(indexPlan); + + // + // Artificially create a lot of churn. Take the first record and flip its vector with the 999th vector, + // take the second record and flip it with the 998th and so on. We still know the expected ground truth and + // can compensate for that. + // + for (int i = 0; i < size / 2;) { + try (FDBRecordContext context = openContext()) { + openRecordStore(context, this::addGroupedVectorIndex); + for (int b = 0; b < updateBatchSize; b ++) { + final int nearerGroupId = i % 2; + final FDBStoredRecord nearer = + Objects.requireNonNull(recordStore.loadRecord(Tuple.from(nearerGroupId, i))); + final VectorRecord nearerRecord = + VectorRecord.newBuilder() + .mergeFrom(nearer.getRecord()) + .build(); + final int furtherRecId = size - i - 1; + final int furtherGroupId = furtherRecId % 2; + final FDBStoredRecord further = + Objects.requireNonNull(recordStore.loadRecord(Tuple.from(furtherGroupId, furtherRecId))); + final VectorRecord furtherRecord = VectorRecord.newBuilder() + .mergeFrom(further.getRecord()) + .build(); + + final Message newNearer = VectorRecord.newBuilder() + .setRecNo(nearerRecord.getRecNo()) + .setGroupId(nearerRecord.getGroupId()) + .setVectorData(furtherRecord.getVectorData()) + .build(); + final Message newFurther = VectorRecord.newBuilder() + .setRecNo(furtherRecord.getRecNo()) + .setGroupId(furtherRecord.getGroupId()) + .setVectorData(nearerRecord.getVectorData()) + .build(); + + recordStore.updateRecord(newNearer); + recordStore.updateRecord(newFurther); + i ++; + } + commit(context); + } + } + + final List> flippedRecords = + savedRecords + .stream() + .map(storedRecord -> { + final VectorRecord vectorRecord = + VectorRecord.newBuilder() + .mergeFrom(storedRecord.getRecord()) + .build(); + final VectorRecord newVectorRecord = + VectorRecord.newBuilder() + .setGroupId((int)(size - vectorRecord.getRecNo() - 1) % 2) + .setRecNo(size - vectorRecord.getRecNo() - 1) + .setVectorData(vectorRecord.getVectorData()) + .build(); + return FDBStoredRecord.newBuilder() + .setRecord(newVectorRecord) + .setPrimaryKey(storedRecord.getPrimaryKey()) + .setRecordType(storedRecord.getRecordType()) + .build(); + }) + .collect(ImmutableList.toImmutableList()); + final Map> groupedFlippedRecords = groupAndSortByDistances(flippedRecords, queryVector); + final Map> expectedResults = trueTopK(groupedFlippedRecords, k); + + try (FDBRecordContext context = openContext()) { + openRecordStore(context, this::addGroupedVectorIndex); + + final int[] allCounters = new int[2]; + final int[] recallCounters = new int[2]; + byte[] continuation = null; + do { + try (final RecordCursorIterator> cursor = + executeQuery(indexPlan, continuation, Bindings.EMPTY_BINDINGS, limit)) { + int numRecords = 0; + while (cursor.hasNext()) { + final FDBQueriedRecord rec = cursor.next(); + final VectorRecord record = + VectorRecord.newBuilder() + .mergeFrom(Objects.requireNonNull(rec).getRecord()) + .build(); + numRecords++; + allCounters[record.getGroupId()]++; + if (expectedResults.get(record.getGroupId()).contains(record.getRecNo())) { + recallCounters[record.getGroupId()]++; + } + } + if (cursor.getNoNextReason() == RecordCursor.NoNextReason.SOURCE_EXHAUSTED) { + 
continuation = null; + } else { + continuation = cursor.getContinuation(); + } + if (logger.isInfoEnabled()) { + logger.info("grouped read after deletes and updates {} records, allCounters={}, recallCounters={}", numRecords, allCounters, + recallCounters); + } + } + } while (continuation != null); + assertThat(Ints.asList(allCounters)) + .allSatisfy(allCounter -> + assertThat(allCounter).isEqualTo(k)); + assertThat(Ints.asList(recallCounters)) + .allSatisfy(recallCounter -> + assertThat((double)recallCounter / k).isGreaterThan(0.9)); + } + } + @ParameterizedTest @MethodSource("randomSeedsWithAsync") void deleteWhereGroupedTest(final long seed, final boolean useAsync) throws Exception { @@ -256,9 +387,12 @@ void deleteWhereGroupedTest(final long seed, final boolean useAsync) throws Exce final Random random = new Random(seed); final HalfRealVector queryVector = randomHalfVector(random, 128); - final Map> expectedResults = saveRandomRecords(random, this::addGroupedVectorIndex, - useAsync, 200, queryVector); - final var indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); + final List> savedRecords = + saveRandomRecords(useAsync, this::addGroupedVectorIndex, random, 200); + final Map> randomRecords = groupAndSortByDistances(savedRecords, queryVector); + final Map> expectedResults = trueTopK(randomRecords, 200); + + final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); try (FDBRecordContext context = openContext()) { openRecordStore(context, this::addGroupedVectorIndex); @@ -414,12 +548,15 @@ void directIndexMaintainerTest() throws Exception { @ParameterizedTest @MethodSource("randomSeedsWithReturnVectors") void directIndexReadGroupedWithContinuationTest(final long seed, final boolean returnVectors) throws Exception { + final int size = 1000; final int k = 100; final Random random = new Random(seed); final HalfRealVector queryVector = randomHalfVector(random, 128); - final Map> expectedResults = - saveRandomRecords(random, this::addGroupedVectorIndex, true, 1000, queryVector); + final List> savedRecords = + saveRandomRecords(true, this::addGroupedVectorIndex, random, size); + final Map> randomRecords = groupAndSortByDistances(savedRecords, queryVector); + final Map> expectedResults = trueTopK(randomRecords, k); try (FDBRecordContext context = openContext()) { openRecordStore(context, this::addGroupedVectorIndex); @@ -459,7 +596,7 @@ void directIndexReadGroupedWithContinuationTest(final long seed, final boolean r assertThat(indexEntry.getValue().get(0) != null).isEqualTo(returnVectors); } if (logger.isInfoEnabled()) { - logger.info("grouped read {} records, allCounters={}, recallCounters={}", numRecords, allCounters, + logger.info("(direct) grouped read {} records, allCounters={}, recallCounters={}", numRecords, allCounters, recallCounters); } } @@ -476,10 +613,10 @@ void directIndexReadGroupedWithContinuationTest(final long seed, final boolean r @Nonnull private static RecordQueryIndexPlan createIndexPlan(@Nonnull final HalfRealVector queryVector, final int k, @Nonnull final String indexName) { - final var vectorIndexScanComparisons = + final VectorIndexScanComparisons vectorIndexScanComparisons = createVectorIndexScanComparisons(queryVector, k, VectorIndexScanOptions.empty()); - final var baseRecordType = + final Type.Record baseRecordType = Type.Record.fromFieldDescriptorsMap( Type.Record.toFieldDescriptorMap(VectorRecord.getDescriptor().getFields())); @@ -500,18 +637,4 @@ private static VectorIndexScanComparisons 
createVectorIndexScanComparisons(@Nonn return VectorIndexScanComparisons.byDistance(ScanComparisons.EMPTY, distanceRankComparison, vectorIndexScanOptions); } - - @Nonnull - private Map> saveRandomRecords(@Nonnull final Random random, @Nonnull final RecordMetaDataHook hook, - final boolean useAsync, final int numSamples, - @Nonnull final HalfRealVector queryVector) throws Exception { - final List> savedRecords = - saveRecords(useAsync, hook, random, numSamples); - - return sortByDistances(savedRecords, queryVector, Metric.EUCLIDEAN_METRIC) - .stream() - .map(NodeReference::getPrimaryKey) - .map(primaryKey -> primaryKey.getLong(0)) - .collect(Collectors.groupingBy(nodeId -> Math.toIntExact(nodeId) % 2, Collectors.toSet())); - } } diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTestBase.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTestBase.java index 525d64f593..3fe7ee9535 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTestBase.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTestBase.java @@ -21,6 +21,7 @@ package com.apple.foundationdb.record.provider.foundationdb.indexes; import com.apple.foundationdb.async.AsyncUtil; +import com.apple.foundationdb.async.hnsw.NodeReference; import com.apple.foundationdb.async.hnsw.NodeReferenceWithDistance; import com.apple.foundationdb.half.Half; import com.apple.foundationdb.linear.AffineOperator; @@ -42,6 +43,7 @@ import com.apple.test.Tags; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import com.google.errorprone.annotations.CanIgnoreReturnValue; import com.google.protobuf.ByteString; import com.google.protobuf.Message; @@ -54,9 +56,12 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.function.Function; +import java.util.stream.Collectors; import static com.apple.foundationdb.record.metadata.Key.Expressions.concat; import static com.apple.foundationdb.record.metadata.Key.Expressions.concatenateFields; @@ -133,24 +138,24 @@ protected static HalfRealVector randomHalfVector(final Random random, final int return new HalfRealVector(componentData); } - protected List> saveRecords(final boolean useAsync, - @Nonnull final RecordMetaDataHook hook, - @Nonnull final Random random, - final int numSamples) throws Exception { - return saveRecords(useAsync, hook, random, numSamples, 0.0d); + protected List> saveRandomRecords(final boolean useAsync, + @Nonnull final RecordMetaDataHook hook, + @Nonnull final Random random, + final int numRecords) throws Exception { + return saveRandomRecords(useAsync, hook, random, numRecords, 0.0d); } - protected List> saveRecords(final boolean useAsync, - @Nonnull final RecordMetaDataHook hook, - @Nonnull final Random random, - final int numSamples, - final double nullProbability) throws Exception { + protected List> saveRandomRecords(final boolean useAsync, + @Nonnull final RecordMetaDataHook hook, + @Nonnull final Random random, + final int numRecords, + final double nullProbability) throws Exception { final var recordGenerator = getRecordGenerator(random, nullProbability); if (useAsync) { - 
return asyncBatch(hook, numSamples, 100, + return asyncBatch(hook, numRecords, 100, recNo -> recordStore.saveRecordAsync(recordGenerator.apply(recNo))); } else { - return batch(hook, numSamples, 100, + return batch(hook, numRecords, 100, recNo -> recordStore.saveRecord(recordGenerator.apply(recNo))); } } @@ -200,6 +205,30 @@ private List> batch(final RecordMetaDataH return records; } + @Nonnull + protected static Map> trueTopK(@Nonnull final Map> sortedByDistances, + final int k) { + return sortedByDistances.entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, + entry -> + entry.getValue() + .stream() + .limit(k) + .collect(ImmutableSet.toImmutableSet()))); + } + + @Nonnull + protected static Map> groupAndSortByDistances(@Nonnull final List> savedRecords, + @Nonnull final HalfRealVector queryVector) { + return sortByDistances(savedRecords, queryVector, Metric.EUCLIDEAN_METRIC) + .stream() + .map(NodeReference::getPrimaryKey) + .map(primaryKey -> primaryKey.getLong(0)) + .collect(Collectors.groupingBy(nodeId -> Math.toIntExact(nodeId) % 2, Collectors.toList())); + } + + @Nonnull protected static List sortByDistances(@Nonnull final List> storedRecords, @Nonnull final RealVector queryVector, diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/recordrepair/ValidationTestUtils.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/recordrepair/ValidationTestUtils.java index 6412a4f1a2..9ee88b03da 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/recordrepair/ValidationTestUtils.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/recordrepair/ValidationTestUtils.java @@ -42,7 +42,7 @@ public class ValidationTestUtils { private static final int LONG_RECORD_SPACING = 17; - // A few constants for records that were saved with saveRecords() below + // A few constants for records that were saved with saveRandomRecords() below public static final int RECORD_INDEX_WITH_NO_SPLITS = 1; public static final int RECORD_ID_WITH_NO_SPLITS = RECORD_INDEX_WITH_NO_SPLITS + 1; public static final int RECORD_INDEX_WITH_TWO_SPLITS = 16; From e909a7ee72f218d7721046e802b51adc16a9f643 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Mon, 15 Dec 2025 21:46:18 +0100 Subject: [PATCH 13/17] protected inlining layer against reinserts --- .../apple/foundationdb/async/hnsw/HNSW.java | 231 +++++++++++------- .../foundationdb/async/hnsw/HNSWTest.java | 4 +- .../foundationdb/indexes/VectorIndexTest.java | 1 - 3 files changed, 148 insertions(+), 88 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 56027ce269..460f59b40d 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -26,7 +26,6 @@ import com.apple.foundationdb.annotation.API; import com.apple.foundationdb.async.AsyncUtil; import com.apple.foundationdb.async.MoreAsyncUtil; -import com.apple.foundationdb.linear.AffineOperator; import com.apple.foundationdb.linear.Estimator; import com.apple.foundationdb.linear.FhtKacRotator; import com.apple.foundationdb.linear.Metric; @@ -311,7 +310,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { private CompletableFuture> searchFinalLayer(@Nonnull final StorageAdapter 
storageAdapter, final @Nonnull ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, final int k, final int efSearch, @@ -327,7 +326,7 @@ private Quantizer quantizer(@Nullable final AccessInfo accessInfo) { @Nonnull private ImmutableList - postProcessNearestNeighbors(@Nonnull final AffineOperator storageTransform, final int k, + postProcessNearestNeighbors(@Nonnull final StorageTransform storageTransform, final int k, @Nonnull final List> nearestNeighbors, final boolean includeVectors) { final int lastIndex = Math.max(nearestNeighbors.size() - k, 0); @@ -398,6 +397,10 @@ private CompletableFuture greedySearchInliningLayer(@ @Nonnull final NodeReferenceWithDistance nodeReferenceWithDistance, final int layer, @Nonnull final Transformed queryVector) { + final NodeFactory nodeFactory = storageAdapter.getNodeFactory(); + final Map> nodeCache = Maps.newHashMap(); + final Map> updatedNodes = Maps.newHashMap(); + final AtomicReference nearestNodeReferenceAtomic = new AtomicReference<>(null); @@ -408,55 +411,84 @@ private CompletableFuture greedySearchInliningLayer(@ Comparator.comparing(NodeReferenceWithDistance::getDistance)); candidates.add(nodeReferenceWithDistance); - return AsyncUtil.whileTrue(() -> onReadListener.onAsyncRead( - storageAdapter.fetchNode(readTransaction, storageTransform, layer, - Objects.requireNonNull(candidates.peek()).getPrimaryKey())) - .thenCompose(node -> { - if (node == null) { - // - // This cannot happen under normal circumstances as the storage adapter returns a node with no - // neighbors if it already has been deleted. Therefore, it is correct to throw here. - // - throw new IllegalStateException("unable to fetch node"); - } - final InliningNode candidateNode = node.asInliningNode(); - final List neighbors = candidateNode.getNeighbors(); - - if (neighbors.isEmpty()) { - // If there are no neighbors, we either really have no neighbor on this level anymore and the - // node does exist (on layer 0), or not. - return exists(readTransaction, node.getPrimaryKey()) - .thenApply(nodeExists -> nodeExists ? candidateNode : null); - } else { - return CompletableFuture.completedFuture(candidateNode); - } - }) - .thenApply(candidateNode -> { - final NodeReferenceWithDistance candidateReference = Objects.requireNonNull(candidates.poll()); - if (candidateNode != null) { - // - // This node definitely does exist. And it's the nearest one. - // - nearestNodeReferenceAtomic.set(candidateReference); - candidates.clear(); + return AsyncUtil.whileTrue(() -> { + final NodeReferenceWithDistance candidateReference = Objects.requireNonNull(candidates.poll()); + return onReadListener.onAsyncRead( + fetchNodeIfNotCached(storageAdapter, readTransaction, storageTransform, layer, + candidateReference, nodeCache)) + .thenCompose(node -> { + if (node == null) { + // + // This cannot happen under normal circumstances as the storage adapter returns a node with no + // neighbors if it already has been deleted. Therefore, it is correct to throw here. + // + throw new IllegalStateException("unable to fetch node"); + } + final InliningNode candidateNode = node.asInliningNode(); + final List neighbors = candidateNode.getNeighbors(); - // - // Find some new candidates. 
- // - double minDistance = candidateReference.getDistance(); + if (!neighbors.isEmpty()) { + return CompletableFuture.completedFuture(candidateNode); + } - for (final NodeReferenceWithVector neighbor : candidateNode.getNeighbors()) { - final double distance = - estimator.distance(neighbor.getVector(), queryVector); - if (distance < minDistance) { - candidates.add( - new NodeReferenceWithDistance(neighbor.getPrimaryKey(), neighbor.getVector(), - distance)); + if (updatedNodes.containsKey(candidateReference.getPrimaryKey())) { + return CompletableFuture.completedFuture(updatedNodes.get(candidateReference.getPrimaryKey())); + } + + return fetchBaseNode(readTransaction, storageTransform, candidateReference.getPrimaryKey()) + .thenApply(baseCompactNode -> { + if (baseCompactNode == null) { + // node does not exist on layer 0 + return null; + } + + // + // Node does still exist or an updated version exists -- create new reference + // and push it back into the queue + // + final Transformed baseVector = baseCompactNode.getVector(); + + final double distance = + estimator.distance(baseVector, queryVector); + + final NodeReferenceWithDistance updatedNodeReference = + new NodeReferenceWithDistance(baseCompactNode.getPrimaryKey(), + baseVector, + distance); + candidates.add(updatedNodeReference); + updatedNodes.put(candidateReference.getPrimaryKey(), + nodeFactory.create(candidateReference.getPrimaryKey(), + baseCompactNode.getVector(), candidateNode.getNeighbors())); + return null; + }); + + }) + .thenApply(candidateNode -> { + if (candidateNode != null) { + // + // This node definitely does exist. And it's the nearest one. + // + nearestNodeReferenceAtomic.set(candidateReference); + candidates.clear(); + + // + // Find some new candidates. + // + double minDistance = candidateReference.getDistance(); + + for (final NodeReferenceWithVector neighbor : candidateNode.getNeighbors()) { + final double distance = + estimator.distance(neighbor.getVector(), queryVector); + if (distance < minDistance) { + candidates.add( + new NodeReferenceWithDistance(neighbor.getPrimaryKey(), neighbor.getVector(), + distance)); + } } } - } - return !candidates.isEmpty(); - }), executor).thenApply(ignored -> nearestNodeReferenceAtomic.get()); + return !candidates.isEmpty(); + }); + }, executor).thenApply(ignored -> nearestNodeReferenceAtomic.get()); } /** @@ -493,7 +525,7 @@ private CompletableFuture greedySearchInliningLayer(@ private CompletableFuture>> searchLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, @Nonnull final Collection nodeReferences, final int layer, @@ -601,8 +633,7 @@ private CompletableFuture greedySearchInliningLayer(@ * fetched from the underlying storage using the {@code storageAdapter}. Once fetched, the node * is added to the {@code nodeCache} before the future is completed. *

- * This is a convenience method that delegates to - * {@link #fetchNodeIfNecessaryAndApply(StorageAdapter, ReadTransaction, AffineOperator, int, NodeReference, Function, BiFunction)}. + * This is a convenience method that delegates to {@link #fetchNodeIfNecessaryAndApply}. * * @param the type of the node reference, which must extend {@link NodeReference} * @param storageAdapter the storage adapter used to fetch the node from persistent storage @@ -619,7 +650,7 @@ private CompletableFuture greedySearchInliningLayer(@ private CompletableFuture> fetchNodeIfNotCached(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, final int layer, @Nonnull final NodeReference nodeReference, @Nonnull final Map> nodeCache) { @@ -668,7 +699,7 @@ private CompletableFuture greedySearchInliningLayer(@ private CompletableFuture fetchNodeIfNecessaryAndApply(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, final int layer, @Nonnull final R nodeReference, @Nonnull final Function fetchBypassFunction, @@ -710,7 +741,7 @@ private CompletableFuture greedySearchInliningLayer(@ private CompletableFuture> fetchNeighborhoodReferences(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, final int layer, @Nonnull final Iterable neighborReferences, @Nonnull final Map> nodeCache) { @@ -769,7 +800,7 @@ private CompletableFuture greedySearchInliningLayer(@ private CompletableFuture>> fetchSomeNodesIfNotCached(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, final int layer, @Nonnull final Iterable nodeReferences, @Nonnull final Map> nodeCache) { @@ -821,7 +852,7 @@ private CompletableFuture greedySearchInliningLayer(@ private CompletableFuture> fetchSomeNodesAndApply(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, final int layer, @Nonnull final Iterable nodeReferences, @Nonnull final Function fetchBypassFunction, @@ -954,19 +985,36 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N } @Nonnull - private CompletableFuture>> + private CompletableFuture>> filterExisting(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final Iterable> nodeReferenceAndNodes) { + @Nonnull final StorageTransform storageTransform, + @Nonnull final Iterable> nodeReferenceAndNodes) { if (!storageAdapter.isInliningStorageAdapter()) { return CompletableFuture.completedFuture(ImmutableList.copyOf(nodeReferenceAndNodes)); } return forEach(nodeReferenceAndNodes, nodeReferenceAndNode -> { - if (nodeReferenceAndNode.getNode().getNeighbors().isEmpty()) { - return exists(readTransaction, nodeReferenceAndNode.getNodeReference().getPrimaryKey()) - .thenApply(nodeExists -> nodeExists ? 
nodeReferenceAndNode : null); + final AbstractNode node = nodeReferenceAndNode.getNode(); + if (node.getNeighbors().isEmpty()) { + final NodeReferenceWithVector nodeReference = nodeReferenceAndNode.getNodeReference(); + return fetchBaseNode(readTransaction, storageTransform, nodeReference.getPrimaryKey()) + .thenApply(baseCompactNode -> { + if (baseCompactNode == null) { + return null; + } + + // + // The node does exist on layer 0, the base node is a compact node, and we can + // use its vector going forward. This may be necessary if this is a dangling + // reference and the record has been reinserted after deletion. + // + final NodeReferenceWithVector updatedNodeReference = + new NodeReferenceWithVector(baseCompactNode.getPrimaryKey(), + baseCompactNode.getVector()); + return new NodeReferenceAndNode<>(updatedNodeReference, node); + }); } else { // this node has neighbors -- it must exist return CompletableFuture.completedFuture(nodeReferenceAndNode); @@ -975,8 +1023,9 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N getConfig().getMaxNumConcurrentNodeFetches(), getExecutor()) .thenApply(results -> { - final ImmutableList.Builder> filteredListBuilder = ImmutableList.builder(); - for (final NodeReferenceAndNode result : results) { + final ImmutableList.Builder> filteredListBuilder = + ImmutableList.builder(); + for (final NodeReferenceAndNode result : results) { if (result != null) { filteredListBuilder.add(result); } @@ -986,19 +1035,31 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N } @Nonnull - @VisibleForTesting - CompletableFuture exists(@Nonnull final ReadTransaction readTransaction, - @Nonnull final Tuple primaryKey) { - final StorageAdapter storageAdapter = getStorageAdapterForLayer(0); - + private CompletableFuture exists(@Nonnull final ReadTransaction readTransaction, + @Nonnull final Tuple primaryKey) { // - // Call fetchNode() to check for the node's existence; we are handing in the identity operator, since we don't - // care about the vector itself at all. + // Call fetchBaseNode() to check for the node's existence; we are handing in the identity operator, + // since we do not care about the vector itself at all. // - return storageAdapter.fetchNode(readTransaction, AffineOperator.identity(), 0, primaryKey) + return fetchBaseNode(readTransaction, StorageTransform.identity(), primaryKey) .thenApply(Objects::nonNull); } + @Nonnull + private CompletableFuture fetchBaseNode(@Nonnull final ReadTransaction readTransaction, + @Nonnull final StorageTransform storageTransform, + @Nonnull final Tuple primaryKey) { + final StorageAdapter storageAdapter = getStorageAdapterForLayer(0); + + return storageAdapter.fetchNode(readTransaction, storageTransform, 0, primaryKey) + .thenApply(node -> { + if (node == null) { + return null; + } + return node.asCompactNode(); + }); + } + /** * Method to keep stats if necessary. Stats need to be kept and maintained when the client would like to use * e.g. RaBitQ as RaBitQ needs a stable somewhat correct centroid in order to function properly. 
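
To make the dangling-reference handling in the hunk above concrete, here is a minimal sketch (not part of the patch) of the refresh-or-drop step that filterExisting applies to a single inlining-layer reference. It assumes a fetchBaseNode helper shaped like the one added above, i.e. one that completes with the layer-0 CompactNode or with null once the record has been deleted, and it elides the batching that filterExisting performs via forEach:

    // Illustrative sketch only; mirrors the refresh-or-drop step applied per reference.
    // Assumes fetchBaseNode(readTransaction, storageTransform, primaryKey) completes with the
    // layer-0 CompactNode, or with null when the record has been deleted.
    private CompletableFuture<NodeReferenceAndNode<NodeReferenceWithVector>>
            refreshOrDrop(@Nonnull final ReadTransaction readTransaction,
                          @Nonnull final StorageTransform storageTransform,
                          @Nonnull final NodeReferenceAndNode<NodeReferenceWithVector> referenceAndNode) {
        final NodeReferenceWithVector reference = referenceAndNode.getNodeReference();
        return fetchBaseNode(readTransaction, storageTransform, reference.getPrimaryKey())
                .thenApply(baseCompactNode -> {
                    if (baseCompactNode == null) {
                        // dangling reference: the record was deleted, drop it from the result
                        return null;
                    }
                    // the record still exists (possibly reinserted): adopt the authoritative
                    // layer-0 vector so a stale inlined vector is not propagated
                    final NodeReferenceWithVector refreshed =
                            new NodeReferenceWithVector(baseCompactNode.getPrimaryKey(),
                                    baseCompactNode.getVector());
                    return new NodeReferenceAndNode<>(refreshed, referenceAndNode.getNode());
                });
    }

Resolving the vector from layer 0 instead of trusting the inlined copy is what protects the upper layers against a record that was deleted and later reinserted with a different vector.
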
@@ -1103,7 +1164,7 @@ private AggregatedVector aggregateVectors(@Nonnull final Iterable @@ -1125,7 +1186,7 @@ private AggregatedVector aggregateVectors(@Nonnull final Iterable insertIntoLayers(@Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Quantizer quantizer, @Nonnull final Tuple newPrimaryKey, @Nonnull final Transformed newVector, @@ -1186,7 +1247,7 @@ private CompletableFuture insertIntoLayers(@Nonnull final Transaction tran private CompletableFuture>> insertIntoLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Quantizer quantizer, @Nonnull final List nearestNeighbors, final int layer, @@ -1367,7 +1428,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private CompletableFuture>> pruneNeighborsIfNecessary(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, final int layer, @Nonnull final NodeReferenceWithVector nodeReferenceWithVector, @@ -1437,7 +1498,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private CompletableFuture>> selectCandidates(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, @Nonnull final Iterable initialCandidates, final int layer, @@ -1526,7 +1587,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private CompletableFuture> extendCandidatesIfNecessary(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, @Nonnull final Collection> candidates, final int layer, @@ -1578,7 +1639,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private CompletableFuture>> neighbors(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final SplittableRandom random, @Nonnull final Collection> initialNodeReferenceAndNodes, @Nonnull final CandidatePredicate samplingPredicate, @@ -1590,7 +1651,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) fetchSomeNodesIfNotCached(storageAdapter, readTransaction, storageTransform, layer, neighbors, nodeCache)) .thenCompose(neighbors -> - filterExisting(storageAdapter, readTransaction, neighbors)); + filterExisting(storageAdapter, readTransaction, storageTransform, neighbors)); } /** @@ -1615,7 +1676,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) private CompletableFuture> neighborReferences(@Nonnull final StorageAdapter storageAdapter, @Nonnull final ReadTransaction readTransaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nullable final SplittableRandom random, @Nonnull final Collection> initialNodeReferenceAndNodes, @Nonnull final CandidatePredicate samplingPredicate, @@ -1826,7 +1887,7 @@ public CompletableFuture delete(@Nonnull final Transaction transaction, @N */ 
@Nonnull private CompletableFuture> deleteFromLayers(@Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Quantizer quantizer, @Nonnull final SplittableRandom random, @Nonnull final Tuple primaryKey, @@ -1859,7 +1920,7 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi private CompletableFuture deleteFromLayer(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Quantizer quantizer, @Nonnull final SplittableRandom random, final int layer, @@ -2058,7 +2119,7 @@ private void initializeCandidateChangeSetMap(@Nonnull private CompletableFuture>> findCandidates(final @Nonnull StorageAdapter storageAdapter, final @Nonnull Transaction transaction, - final @Nonnull AffineOperator storageTransform, + final @Nonnull StorageTransform storageTransform, final @Nonnull SplittableRandom random, final int layer, final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode, @@ -2109,7 +2170,7 @@ private void initializeCandidateChangeSetMap(@Nonnull private @Nonnull CompletableFuture repairNeighbor(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, final int layer, @Nonnull final N neighborReference, @@ -2164,7 +2225,7 @@ private void initializeCandidateChangeSetMap(@Nonnull private CompletableFuture repairInsForNeighborNode(@Nonnull final StorageAdapter storageAdapter, @Nonnull final Transaction transaction, - @Nonnull final AffineOperator storageTransform, + @Nonnull final StorageTransform storageTransform, @Nonnull final Estimator estimator, final int layer, @Nonnull final N neighborReference, diff --git a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java index 0d29f9484c..961374115b 100644 --- a/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java +++ b/fdb-extensions/src/test/java/com/apple/foundationdb/async/hnsw/HNSWTest.java @@ -228,7 +228,7 @@ void testInliningSerialization(final long seed) throws Exception { static Stream randomSeedsWithConfig() { return RandomizedTestUtils.randomSeeds(0xdeadc0deL) - .flatMap(seed -> Sets.cartesianProduct(ImmutableSet.of(false, true), + .flatMap(seed -> Sets.cartesianProduct(ImmutableSet.of(true, false), ImmutableSet.of(false, true), ImmutableSet.of(false, true), ImmutableSet.of(false, true)).stream() @@ -846,7 +846,7 @@ public void afterTestExecution(@Nonnull final ExtensionContext context) { final HNSWTest hnswTest = (HNSWTest)context.getRequiredTestInstance(); final Config config = (Config)args.get(1); - logger.error("dumping contents of HNSW to disk"); + logger.error("dumping contents of HNSW to {}", hnswTest.tempDir.toString()); dumpLayers(hnswTest, config); } else { logger.error("test failed with no parameterized arguments (non-parameterized test or older JUnit)."); diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java index be088b45f0..ca0af4b175 100644 --- 
a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java @@ -578,7 +578,6 @@ void directIndexReadGroupedWithContinuationTest(final long seed, final boolean r .setState(ExecuteState.NO_LIMITS) .setReturnedRowLimit(Integer.MAX_VALUE).build().asScanProperties(false); - try (final RecordCursor cursor = indexMaintainer.scan(vectorIndexScanComparisons.bind(recordStore, index, EvaluationContext.empty()), null, scanProperties)) { From 759378cdb2e6e0153e55e5cf77c585a522eb76ac Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Tue, 16 Dec 2025 10:22:03 +0100 Subject: [PATCH 14/17] removing some test code replication in VectorIndexTest --- .../apple/foundationdb/async/hnsw/HNSW.java | 4 +- .../foundationdb/indexes/VectorIndexTest.java | 119 +++++++----------- 2 files changed, 45 insertions(+), 78 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 460f59b40d..080f87bd58 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -1006,8 +1006,8 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N } // - // The node does exist on layer 0, the base node is a compact node, and we can - // use its vector going forward. This may be necessary if this is a dangling + // The node does exist on layer 0 meaning the base node is a compact node, and we + // can use its vector going forward. This may be necessary if this is a dangling // reference and the record has been reinserted after deletion. 
// final NodeReferenceWithVector updatedNodeReference = diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java index ca0af4b175..9a8dad467c 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/indexes/VectorIndexTest.java @@ -63,6 +63,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; import com.google.common.collect.ObjectArrays; import com.google.common.collect.Sets; import com.google.common.primitives.Ints; @@ -82,6 +83,7 @@ import java.util.Optional; import java.util.Random; import java.util.Set; +import java.util.stream.IntStream; import java.util.stream.Stream; import static com.apple.foundationdb.record.metadata.Key.Expressions.concat; @@ -152,6 +154,12 @@ void basicWriteIndexReadWithContinuationTest(final long seed, final boolean useA final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "UngroupedVectorIndex"); + checkResults(indexPlan, limit, expectedResults); + } + + private void checkResults(@Nonnull final RecordQueryIndexPlan indexPlan, + final int limit, + @Nonnull final Set expectedResults) throws Exception { verifyRebase(indexPlan); verifySerialization(indexPlan); @@ -188,8 +196,8 @@ void basicWriteIndexReadWithContinuationTest(final long seed, final boolean useA } } } while (continuation != null); - assertThat(allCounter).isEqualTo(k); - assertThat((double)recallCounter / k).isGreaterThan(0.9); + assertThat(allCounter).isEqualTo(expectedResults.size()); + assertThat((double)recallCounter / expectedResults.size()).isGreaterThan(0.9); } } @@ -208,10 +216,15 @@ void basicWriteIndexReadGroupedWithContinuationTest(final long seed, final boole final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); + checkResultsGrouped(indexPlan, limit, expectedResults); + } + + private void checkResultsGrouped(@Nonnull final RecordQueryIndexPlan indexPlan, final int limit, + @Nonnull final Map> expectedResults) throws Exception { verifyRebase(indexPlan); verifySerialization(indexPlan); - try (FDBRecordContext context = openContext()) { + try (final FDBRecordContext context = openContext()) { openRecordStore(context, this::addGroupedVectorIndex); final int[] allCounters = new int[2]; @@ -244,12 +257,24 @@ void basicWriteIndexReadGroupedWithContinuationTest(final long seed, final boole } } } while (continuation != null); - assertThat(Ints.asList(allCounters)) - .allSatisfy(allCounter -> - assertThat(allCounter).isEqualTo(k)); - assertThat(Ints.asList(recallCounters)) - .allSatisfy(recallCounter -> - assertThat((double)recallCounter / k).isGreaterThan(0.9)); + + IntStream.range(0, allCounters.length) + .forEach(index -> { + assertThat(allCounters[index]) + .as("allCounters[%d]", index) + .satisfies(allCountersAtIndex -> { + assertThat(allCountersAtIndex).isEqualTo( + expectedResults.getOrDefault(index, ImmutableSet.of()).size()); + }); + assertThat(recallCounters[index]) + .as("recallCounters[%d]", index) + .satisfies(recallCountersAtIndex -> { + assertThat((double)recallCountersAtIndex / + expectedResults.getOrDefault(index, 
ImmutableSet.of()).size()) + .isGreaterThan(0.9); + }); + + }); } } @@ -267,10 +292,6 @@ void insertReadDeleteReadGroupedWithContinuationTest(final long seed, final bool final HalfRealVector queryVector = randomHalfVector(random, 128); - final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); - verifyRebase(indexPlan); - verifySerialization(indexPlan); - // // Artificially create a lot of churn. Take the first record and flip its vector with the 999th vector, // take the second record and flip it with the 998th and so on. We still know the expected ground truth and @@ -338,46 +359,9 @@ void insertReadDeleteReadGroupedWithContinuationTest(final long seed, final bool final Map> groupedFlippedRecords = groupAndSortByDistances(flippedRecords, queryVector); final Map> expectedResults = trueTopK(groupedFlippedRecords, k); - try (FDBRecordContext context = openContext()) { - openRecordStore(context, this::addGroupedVectorIndex); + final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); - final int[] allCounters = new int[2]; - final int[] recallCounters = new int[2]; - byte[] continuation = null; - do { - try (final RecordCursorIterator> cursor = - executeQuery(indexPlan, continuation, Bindings.EMPTY_BINDINGS, limit)) { - int numRecords = 0; - while (cursor.hasNext()) { - final FDBQueriedRecord rec = cursor.next(); - final VectorRecord record = - VectorRecord.newBuilder() - .mergeFrom(Objects.requireNonNull(rec).getRecord()) - .build(); - numRecords++; - allCounters[record.getGroupId()]++; - if (expectedResults.get(record.getGroupId()).contains(record.getRecNo())) { - recallCounters[record.getGroupId()]++; - } - } - if (cursor.getNoNextReason() == RecordCursor.NoNextReason.SOURCE_EXHAUSTED) { - continuation = null; - } else { - continuation = cursor.getContinuation(); - } - if (logger.isInfoEnabled()) { - logger.info("grouped read after deletes and updates {} records, allCounters={}, recallCounters={}", numRecords, allCounters, - recallCounters); - } - } - } while (continuation != null); - assertThat(Ints.asList(allCounters)) - .allSatisfy(allCounter -> - assertThat(allCounter).isEqualTo(k)); - assertThat(Ints.asList(recallCounters)) - .allSatisfy(recallCounter -> - assertThat((double)recallCounter / k).isGreaterThan(0.9)); - } + checkResultsGrouped(indexPlan, limit, expectedResults); } @ParameterizedTest @@ -390,35 +374,18 @@ void deleteWhereGroupedTest(final long seed, final boolean useAsync) throws Exce final List> savedRecords = saveRandomRecords(useAsync, this::addGroupedVectorIndex, random, 200); final Map> randomRecords = groupAndSortByDistances(savedRecords, queryVector); - final Map> expectedResults = trueTopK(randomRecords, 200); - - final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); + final Map> expectedResults = + Maps.filterKeys( + trueTopK(randomRecords, 200), key -> Objects.requireNonNull(key) % 2 != 0); try (FDBRecordContext context = openContext()) { openRecordStore(context, this::addGroupedVectorIndex); recordStore.deleteRecordsWhere(Query.field("group_id").equalsValue(0)); - - final int[] allCounters = new int[2]; - final int[] recallCounters = new int[2]; - try (final RecordCursorIterator> cursor = executeQuery(indexPlan)) { - while (cursor.hasNext()) { - final FDBQueriedRecord rec = cursor.next(); - final VectorRecord record = - VectorRecord.newBuilder() - .mergeFrom(Objects.requireNonNull(rec).getRecord()) - .build(); - allCounters[record.getGroupId()] ++; - 
if (expectedResults.get(record.getGroupId()).contains(record.getRecNo())) { - recallCounters[record.getGroupId()] ++; - } - } - } - assertThat(allCounters[0]).isEqualTo(0); - assertThat(allCounters[1]).isEqualTo(k); - - assertThat((double)recallCounters[0] / k).isEqualTo(0.0); - assertThat((double)recallCounters[1] / k).isGreaterThan(0.9); + commit(context); } + + final RecordQueryIndexPlan indexPlan = createIndexPlan(queryVector, k, "GroupedVectorIndex"); + checkResultsGrouped(indexPlan, Integer.MAX_VALUE, expectedResults); } @Test From cc704839dfed5e0d4f286f1e2f952bc5a5b5140a Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Tue, 16 Dec 2025 15:56:26 +0100 Subject: [PATCH 15/17] improved the javadoc for delete(.) --- .../apple/foundationdb/async/hnsw/HNSW.java | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 080f87bd58..1f853dade3 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -1806,17 +1806,46 @@ private void writeLonelyNodeOnLayer(@Nonnull final Qua } /** - * Deletes a vector with its associated primary key from the HNSW graph. + * Deletes a record using its associated primary key from the HNSW graph. *

-     * The method first determines the random layer that is used for the node, called the {@code top layer}. It then
-     * applies a deletion algorithm to all layers from {@code 0} to including the {@code top layer} that removes the
-     * record from the structure and locally repairs the relationships between nearby other vectors that were affected
-     * by the delete operation.
+     * This method implements a multi-layer deletion algorithm that maintains the structural integrity of the HNSW
+     * graph. The deletion process consists of several key phases:
+     * <ul>
+     *   <li>Layer Determination: First determines the top layer for the node using the same deterministic
+     *       algorithm used during insertion, ensuring consistent layer assignment across operations.</li>
+     *   <li>Existence Verification: Checks whether the node actually exists in the graph before attempting
+     *       deletion. If the node doesn't exist, the operation completes immediately without error.</li>
+     *   <li>Multi-Layer Deletion: Removes the node from all layers spanning from layer 0 (base layer
+     *       containing all nodes) up to and including the node's top layer. The deletion is performed in parallel
+     *       across all layers for optimal performance.</li>
+     *   <li>Graph Repair: For each layer where the node is deleted, the algorithm repairs the local graph
+     *       structure by identifying the deleted node's neighbors and reconnecting them appropriately. This process:
+     *     <ul>
+     *       <li>Finds candidate replacement connections among the neighbors of neighbors</li>
+     *       <li>Selects optimal new connections using the HNSW distance heuristics</li>
+     *       <li>Updates neighbor lists to maintain graph connectivity and search performance</li>
+     *       <li>Applies connection limits (M, MMax) and prunes excess connections if necessary</li>
+     *     </ul></li>
+     *   <li>Entry Point Management: If the deleted node was serving as the graph's entry point (the starting
+     *       node for search operations), the method automatically selects a new entry point from the remaining nodes
+     *       at the highest available layer. If no nodes remain after deletion, the access information is cleared,
+     *       effectively resetting the graph to an empty state.</li>
+     * </ul>
+     * <p>
+ * All operations are performed transactionally and asynchronously, ensuring consistency and enabling + * non-blocking execution in concurrent environments. * - * @param transaction the {@link Transaction} context for all database operations - * @param primaryKey the unique {@link Tuple} primary key for the new node being inserted + * @param transaction the {@link Transaction} context for all database operations, ensuring atomicity + * and consistency of the deletion and repair operations + * @param primaryKey the unique {@link Tuple} primary key identifying the node to be deleted from the graph * - * @return a {@link CompletableFuture} that completes when the insertion operation is finished + * @return a {@link CompletableFuture} that completes when the deletion operation is fully finished, + * including all graph repairs and entry point updates. The future completes with {@code null} + * on successful deletion. */ @Nonnull public CompletableFuture delete(@Nonnull final Transaction transaction, @Nonnull final Tuple primaryKey) { From b9e10c0393e127147f05052062e2bd2aa6db1cb9 Mon Sep 17 00:00:00 2001 From: Normen Seemann Date: Wed, 17 Dec 2025 10:47:18 +0100 Subject: [PATCH 16/17] addressing comments --- .../apple/foundationdb/async/hnsw/Config.java | 14 ++-- .../apple/foundationdb/async/hnsw/HNSW.java | 68 ++++++++----------- .../async/hnsw/NodeReferenceAndNode.java | 2 +- .../async/hnsw/StorageAdapter.java | 2 +- .../record/metadata/IndexOptions.java | 44 +++++++----- .../VectorIndexScanComparisons.java | 8 +-- 6 files changed, 68 insertions(+), 70 deletions(-) diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java index 931f256879..ac3aae3279 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/Config.java @@ -50,7 +50,7 @@ public final class Config { public static final int DEFAULT_RABITQ_NUM_EX_BITS = 4; // concurrency public static final int DEFAULT_MAX_NUM_CONCURRENT_NODE_FETCHES = 16; - public static final int DEFAULT_MAX_NUM_CONCURRENT_NEIGHBOR_FETCHES = 16; + public static final int DEFAULT_MAX_NUM_CONCURRENT_NEIGHBOR_FETCHES = 10; public static final int DEFAULT_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER = 2; @Nonnull @@ -102,11 +102,11 @@ private Config(@Nonnull final Metric metric, final int numDimensions, final bool Preconditions.checkArgument(maxNumConcurrentNodeFetches > 0 && maxNumConcurrentNodeFetches <= 64, "maxNumConcurrentNodeFetches must be (0, 64]"); Preconditions.checkArgument(maxNumConcurrentNeighborhoodFetches > 0 && - maxNumConcurrentNeighborhoodFetches <= 64, - "maxNumConcurrentNeighborhoodFetches must be (0, 64]"); + maxNumConcurrentNeighborhoodFetches <= 20, + "maxNumConcurrentNeighborhoodFetches must be (0, 20]"); Preconditions.checkArgument(maxNumConcurrentDeleteFromLayer > 0 && - maxNumConcurrentDeleteFromLayer <= 64, - "maxNumConcurrentDeleteFromLayer must be (0, 64]"); + maxNumConcurrentDeleteFromLayer <= 10, + "maxNumConcurrentDeleteFromLayer must be (0, 10]"); this.metric = metric; this.numDimensions = numDimensions; @@ -220,7 +220,7 @@ public int getEfConstruction() { /** * Maximum number of candidate nodes that are considered when a HNSW layer is locally repaired as part of a * delete operation. 
A smaller number causes the delete operation to create a smaller set of candidate nodes - * which improves repair performance but not decreases repair quality, a higher number results in qualitatively + * which improves repair performance but decreases repair quality; a higher number results in qualitatively * better repairs at the expense of slower performance. */ public int getEfRepair() { @@ -353,7 +353,7 @@ public int hashCode() { @Override @Nonnull public String toString() { - return "Config[" + "metric=" + getMetric() + ", numDimensions=" + getNumDimensions() + + return "Config[metric=" + getMetric() + ", numDimensions=" + getNumDimensions() + ", isUseInlining=" + isUseInlining() + ", M=" + getM() + ", MMax=" + getMMax() + ", MMax0=" + getMMax0() + ", efConstruction=" + getEfConstruction() + ", efRepair=" + getEfRepair() + ", isExtendCandidates=" + isExtendCandidates() + diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java index 1f853dade3..7ab65072f5 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java @@ -425,21 +425,16 @@ private CompletableFuture greedySearchInliningLayer(@ throw new IllegalStateException("unable to fetch node"); } final InliningNode candidateNode = node.asInliningNode(); - final List neighbors = candidateNode.getNeighbors(); - - if (!neighbors.isEmpty()) { - return CompletableFuture.completedFuture(candidateNode); - } if (updatedNodes.containsKey(candidateReference.getPrimaryKey())) { return CompletableFuture.completedFuture(updatedNodes.get(candidateReference.getPrimaryKey())); } return fetchBaseNode(readTransaction, storageTransform, candidateReference.getPrimaryKey()) - .thenApply(baseCompactNode -> { + .thenAccept(baseCompactNode -> { if (baseCompactNode == null) { // node does not exist on layer 0 - return null; + return; } // @@ -459,8 +454,8 @@ private CompletableFuture greedySearchInliningLayer(@ updatedNodes.put(candidateReference.getPrimaryKey(), nodeFactory.create(candidateReference.getPrimaryKey(), baseCompactNode.getVector(), candidateNode.getNeighbors())); - return null; - }); + }) + .thenApply(ignored -> null); }) .thenApply(candidateNode -> { @@ -997,28 +992,23 @@ public CompletableFuture insert(@Nonnull final Transaction transaction, @N return forEach(nodeReferenceAndNodes, nodeReferenceAndNode -> { final AbstractNode node = nodeReferenceAndNode.getNode(); - if (node.getNeighbors().isEmpty()) { - final NodeReferenceWithVector nodeReference = nodeReferenceAndNode.getNodeReference(); - return fetchBaseNode(readTransaction, storageTransform, nodeReference.getPrimaryKey()) - .thenApply(baseCompactNode -> { - if (baseCompactNode == null) { - return null; - } + final NodeReferenceWithVector nodeReference = nodeReferenceAndNode.getNodeReference(); + return fetchBaseNode(readTransaction, storageTransform, nodeReference.getPrimaryKey()) + .thenApply(baseCompactNode -> { + if (baseCompactNode == null) { + return null; + } - // - // The node does exist on layer 0 meaning the base node is a compact node, and we - // can use its vector going forward. This may be necessary if this is a dangling - // reference and the record has been reinserted after deletion. 
- // - final NodeReferenceWithVector updatedNodeReference = - new NodeReferenceWithVector(baseCompactNode.getPrimaryKey(), - baseCompactNode.getVector()); - return new NodeReferenceAndNode<>(updatedNodeReference, node); - }); - } else { - // this node has neighbors -- it must exist - return CompletableFuture.completedFuture(nodeReferenceAndNode); - } + // + // The node does exist on layer 0 meaning the base node is a compact node, and we + // can use its vector going forward. This may be necessary if this is a dangling + // reference and the record has been reinserted after deletion. + // + final NodeReferenceWithVector updatedNodeReference = + new NodeReferenceWithVector(baseCompactNode.getPrimaryKey(), + baseCompactNode.getVector()); + return new NodeReferenceAndNode<>(updatedNodeReference, node); + }); }, getConfig().getMaxNumConcurrentNodeFetches(), getExecutor()) @@ -1700,7 +1690,7 @@ extendedCandidates, layer, getConfig().getM(), nodeCache)) findNeighborReferences(@Nonnull final Collection> initialNodeReferenceAndNodes, @Nullable final SplittableRandom random, @Nonnull final CandidatePredicate candidatePredicate) { - final Set neighborReferences = Sets.newHashSet(); + final Set neighborReferences = Sets.newLinkedHashSet(); final ImmutableMap.Builder> initialNodesMapBuilder = ImmutableMap.builder(); for (final NodeReferenceAndNode nodeReferenceAndNode : initialNodeReferenceAndNodes) { initialNodesMapBuilder.put(nodeReferenceAndNode.getNode().getPrimaryKey(), nodeReferenceAndNode); @@ -1967,7 +1957,7 @@ private CompletableFuture> deleteFromLayers(@Nonnull fi final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode = new NodeReferenceAndNode<>(new NodeReference(toBeDeletedPrimaryKey), toBeDeletedNode); - return findCandidates(storageAdapter, transaction, storageTransform, random, layer, + return findDeletionRepairCandidates(storageAdapter, transaction, storageTransform, random, layer, toBeDeletedNodeReferenceAndNode, nodeCache) .thenCompose(candidates -> { initializeCandidateChangeSetMap(toBeDeletedPrimaryKey, toBeDeletedNode, candidates, @@ -2146,13 +2136,13 @@ private void initializeCandidateChangeSetMap(@Nonnull */ @Nonnull private CompletableFuture>> - findCandidates(final @Nonnull StorageAdapter storageAdapter, - final @Nonnull Transaction transaction, - final @Nonnull StorageTransform storageTransform, - final @Nonnull SplittableRandom random, - final int layer, - final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode, - final Map> nodeCache) { + findDeletionRepairCandidates(final @Nonnull StorageAdapter storageAdapter, + final @Nonnull Transaction transaction, + final @Nonnull StorageTransform storageTransform, + final @Nonnull SplittableRandom random, + final int layer, + final NodeReferenceAndNode toBeDeletedNodeReferenceAndNode, + final Map> nodeCache) { return neighbors(storageAdapter, transaction, storageTransform, random, ImmutableList.of(toBeDeletedNodeReferenceAndNode), ((r, initialNodeKeys, size, nodeReference) -> diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java index 4e182c520a..7108624535 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/NodeReferenceAndNode.java @@ -74,7 +74,7 @@ public AbstractNode getNode() { @Override public String toString() { - return "NB[" + nodeReference + "," + node + ']'; + 
return "NRaN[" + nodeReference + "," + node + ']'; } /** diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java index b84f635356..b219fbdd6e 100644 --- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java +++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/StorageAdapter.java @@ -163,7 +163,7 @@ interface StorageAdapter { * layout and therefore the used {@link StorageAdapter}, the vector is either part of the reference * (when using {@link InliningStorageAdapter}) or is s part of the {@link AbstractNode} itself (when using * {@link CompactStorageAdapter}). This method hides that detail from the caller and correctly resolves the vector - * for bot use cases. + * for both use cases. * @param nodeReference a node reference * @param node the accompanying node to {@code nodeReference} * @return the associated vector as {@link Transformed} of {@link RealVector} diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java index fe8a5de5ca..e00bb7a8d4 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/metadata/IndexOptions.java @@ -233,7 +233,7 @@ public class IndexOptions { /** * HNSW-only: The number of dimensions used. All vectors must have exactly this number of dimensions. This option * must be set when interacting with a vector index as it there is no default. - * See {@link Config#getNumDimensions()}. + * @see Config#getNumDimensions() */ public static final String HNSW_NUM_DIMENSIONS = "hnswNumDimensions"; @@ -242,7 +242,8 @@ public class IndexOptions { * persisted as a key/value pair per neighbor which includes the vectors of the neighbors but not for itself. If * inlining is not used, each node is persisted as exactly one key/value pair per node which stores its own vector * but specifically excludes the vectors of the neighbors. The default value is set to - * {@link Config#DEFAULT_USE_INLINING}. See {@link Config#isUseInlining()}. + * {@link Config#DEFAULT_USE_INLINING}. + * @see Config#isUseInlining() */ public static final String HNSW_USE_INLINING = "hnswUseInlining"; @@ -251,7 +252,7 @@ public class IndexOptions { * any layer. While by no means enforced or even enforceable, we strive to create and maintain exactly {@code m} * neighbors for a node. Due to insert/delete operations it is possible that the actual number of neighbors a node * references is not exactly {@code m} at any given time. The default value is set to {@link Config#DEFAULT_M}. - * See {@link Config#getM()}. + * @see Config#getM() */ public static final String HNSW_M = "hnswM"; @@ -260,7 +261,8 @@ public class IndexOptions { * stored on a layer greater than {@code 0}. A node can never have more that {@code mMax} neighbors. That means that * neighbors of a node are pruned if the actual number of neighbors would otherwise exceed {@code mMax}. Note that * this option must be greater than or equal to {@link #HNSW_M}. The default value is set to - * {@link Config#DEFAULT_M_MAX}. See {@link Config#getMMax()}. + * {@link Config#DEFAULT_M_MAX}. 
+ * @see Config#getMMax() */ public static final String HNSW_M_MAX = "hnswMMax"; @@ -269,7 +271,8 @@ public class IndexOptions { * stored on layer {@code 0}. We will never create more that {@code mMax0} neighbors for a node that is stored on * that layer. That means that we even prune the neighbors of a node if the actual number of neighbors would * otherwise exceed {@code mMax0}. Note that this option must be greater than or equal to {@link #HNSW_M_MAX}. - * The default value is set to {@link Config#DEFAULT_M_MAX_0}. See {@link Config#getMMax0()}. + * The default value is set to {@link Config#DEFAULT_M_MAX_0}. + * @see Config#getMMax0() */ public static final String HNSW_M_MAX_0 = "hnswMMax0"; @@ -278,16 +281,17 @@ public class IndexOptions { * of a new node. If {@code HNSW_EF_CONSTRUCTION} is set to {@code 1}, the search naturally follows a greedy * approach (monotonous descent), whereas a high number for {@code HNSW_EF_CONSTRUCTION} allows for a more nuanced * search that can tolerate (false) local minima. The default value is set to {@link Config#DEFAULT_EF_CONSTRUCTION}. - * See {@link Config#getEfConstruction()}. + * @see Config#getEfConstruction() */ public static final String HNSW_EF_CONSTRUCTION = "hnswEfConstruction"; /** * HNSW-only: Maximum number of candidate nodes that are considered when a HNSW layer is locally repaired as part of * a delete operation. A smaller number causes the delete operation to create a smaller set of candidate nodes - * which improves repair performance but not decreases repair quality, a higher number results in qualitatively + * which improves repair performance but decreases repair quality; a higher number results in qualitatively * better repairs at the expense of slower performance. - * The default value is set to {@link Config#DEFAULT_EF_REPAIR}. See {@link Config#getEfRepair()}. + * The default value is set to {@link Config#DEFAULT_EF_REPAIR}. + * @see Config#getEfRepair() */ public static final String HNSW_EF_REPAIR = "hnswEfRepair"; @@ -295,7 +299,7 @@ public class IndexOptions { * HNSW-only: Indicator to signal if, during the insertion of a node, the set of nearest neighbors of that node is * to be extended by the actual neighbors of those neighbors to form a set of candidates that the new node may be * connected to during the insert operation. The default value is set to {@link Config#DEFAULT_EXTEND_CANDIDATES}. - * See {@link Config#isExtendCandidates()}. + * @see Config#isExtendCandidates() */ public static final String HNSW_EXTEND_CANDIDATES = "hnswExtendCandidates"; @@ -303,7 +307,8 @@ public class IndexOptions { * HNSW-only: Indicator to signal if, during the insertion of a node, candidates that have been discarded due to not * satisfying the select-neighbor heuristic may get added back in to pad the set of neighbors if the new node would * otherwise have too few neighbors (see {@link Config#getM()}). The default value is set to - * {@link Config#DEFAULT_KEEP_PRUNED_CONNECTIONS}. See {@link Config#isKeepPrunedConnections()}. + * {@link Config#DEFAULT_KEEP_PRUNED_CONNECTIONS}. + * @see Config#isKeepPrunedConnections() */ public static final String HNSW_KEEP_PRUNED_CONNECTIONS = "hnswKeepPrunedConnections"; @@ -312,7 +317,7 @@ public class IndexOptions { * represents the probability of a vector being inserted to also be written into the samples subspace of the hnsw * structure. The vectors in that subspace are continuously aggregated until a total {@link #HNSW_STATS_THRESHOLD} * has been reached. 
 * The default value is set to {@link Config#DEFAULT_SAMPLE_VECTOR_STATS_PROBABILITY}.
- * {@link Config#getSampleVectorStatsProbability()}.
+ * @see Config#getSampleVectorStatsProbability()
 */
 public static final String HNSW_SAMPLE_VECTOR_STATS_PROBABILITY = "hnswSampleVectorStatsProbability";
@@ -321,7 +326,8 @@ public class IndexOptions {
 * represents the probability of the samples subspace to be further aggregated (rolled-up) when a new vector is
 * inserted. The vectors in that subspace are continuously aggregated until a total
 * {@link #HNSW_STATS_THRESHOLD} has been reached. The default value is set to
- * {@link Config#DEFAULT_MAINTAIN_STATS_PROBABILITY}. See {@link Config#getMaintainStatsProbability()}.
+ * {@link Config#DEFAULT_MAINTAIN_STATS_PROBABILITY}.
+ * @see Config#getMaintainStatsProbability()
 */
 public static final String HNSW_MAINTAIN_STATS_PROBABILITY = "hnswMaintainStatsProbability";
@@ -330,14 +336,15 @@ public class IndexOptions {
 * represents the threshold (being a number of vectors) that when reached causes the stats maintenance logic to
 * compute the actual statistics (currently the centroid of the vectors that have been inserted so far). The result
 * is then inserted into the access info subspace of the index. The default value is set to
- * {@link Config#DEFAULT_STATS_THRESHOLD}. See {@link Config#getStatsThreshold()}.
+ * {@link Config#DEFAULT_STATS_THRESHOLD}.
+ * @see Config#getStatsThreshold()
 */
 public static final String HNSW_STATS_THRESHOLD = "hnswStatsThreshold";
 /**
 * HNSW-only: Indicator if we should use RaBitQ quantization. See {@link com.apple.foundationdb.rabitq.RaBitQuantizer}
 * for more details. The default value is set to {@link Config#DEFAULT_USE_RABITQ}.
- * See {@link Config#isUseRaBitQ()}.
+ * @see Config#isUseRaBitQ()
 */
 public static final String HNSW_USE_RABITQ = "hnswUseRaBitQ";
@@ -345,28 +352,29 @@ public class IndexOptions {
 * HNSW-only: Number of bits per dimension iff {@link #HNSW_USE_RABITQ} is set to {@code "true"}, ignored
 * otherwise. If RaBitQ encoding is used, a vector is stored using roughly
 * {@code 25 + numDimensions * (numExBits + 1) / 8} bytes. The default value is set to
- * {@link Config#DEFAULT_RABITQ_NUM_EX_BITS}. See {@link Config#getRaBitQNumExBits()}.
+ * {@link Config#DEFAULT_RABITQ_NUM_EX_BITS}.
+ * @see Config#getRaBitQNumExBits()
 */
 public static final String HNSW_RABITQ_NUM_EX_BITS = "hnswRaBitQNumExBits";
 /**
 * HNSW-only: Maximum number of concurrent node fetches during search and modification operations. The default value
 * is set to {@link Config#DEFAULT_MAX_NUM_CONCURRENT_NODE_FETCHES}.
- * See {@link Config#getMaxNumConcurrentNodeFetches()}.
+ * @see Config#getMaxNumConcurrentNodeFetches()
 */
 public static final String HNSW_MAX_NUM_CONCURRENT_NODE_FETCHES = "hnswMaxNumConcurrentNodeFetches";
 /**
 * HNSW-only: Maximum number of concurrent neighborhood fetches during modification operations when the neighbors
 * are pruned. The default value is set to {@link Config#DEFAULT_MAX_NUM_CONCURRENT_NEIGHBOR_FETCHES}.
- * See {@link Config#getMaxNumConcurrentNeighborhoodFetches()}.
+ * @see Config#getMaxNumConcurrentNeighborhoodFetches()
 */
 public static final String HNSW_MAX_NUM_CONCURRENT_NEIGHBORHOOD_FETCHES = "hnswMaxNumConcurrentNeighborhoodFetches";
 /**
 * HNSW-only: Maximum number of delete operations that can run concurrently in separate layers during the deletion
 * of a record. The default value is set to {@link Config#DEFAULT_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER}.
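To put the RaBitQ size estimate quoted above into concrete numbers (the dimension count and bit width here are purely illustrative): with 768 dimensions and numExBits = 4, the formula gives roughly 25 + 768 * (4 + 1) / 8 = 25 + 480 = 505 bytes per encoded vector, compared to about 768 * 4 = 3072 bytes for an unquantized vector -- assuming, for the comparison only, roughly four bytes per dimension for the raw encoding.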
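As a minimal sketch of how the option strings described above might be wired together when defining a vector index: the option keys below are the constants from this file, while the index type name ("vector"), the record field name ("embedding"), and the concrete values are assumptions for illustration only.

// Sketch only -- the index type name and the field name are assumptions, not taken from this patch.
import java.util.Map;

import com.apple.foundationdb.record.metadata.Index;
import com.apple.foundationdb.record.metadata.IndexOptions;
import com.apple.foundationdb.record.metadata.Key;

public final class HnswIndexOptionsSketch {
    public static Index sampleVectorIndex() {
        final Map<String, String> hnswOptions = Map.of(
                IndexOptions.HNSW_NUM_DIMENSIONS, "768",   // mandatory -- there is no default
                IndexOptions.HNSW_M, "16",                  // target number of neighbors per node
                IndexOptions.HNSW_M_MAX, "32",              // cap on layers > 0, must be >= hnswM
                IndexOptions.HNSW_M_MAX_0, "64",            // cap on layer 0, must be >= hnswMMax
                IndexOptions.HNSW_EF_CONSTRUCTION, "200",   // search breadth while inserting
                IndexOptions.HNSW_USE_RABITQ, "true",       // enable RaBitQ quantization
                IndexOptions.HNSW_RABITQ_NUM_EX_BITS, "4"); // extra bits per dimension

        // "vector" as the index type is an assumption; use whatever type the vector index
        // maintainer introduced by this patch series actually registers.
        return new Index("MyRecord$embedding_hnsw", Key.Expressions.field("embedding"),
                "vector", hnswOptions);
    }
}

All values are passed as strings because index options are plain string-to-string metadata; the per-index HNSW configuration is then derived from them.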
- * See {@link Config#getMaxNumConcurrentDeleteFromLayer()}.
+ * @see Config#getMaxNumConcurrentDeleteFromLayer()
 */
 public static final String HNSW_MAX_NUM_CONCURRENT_DELETE_FROM_LAYER = "hnswMaxNumConcurrentDeleteFromLayer";
diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java
index bee821db00..83f4fff756 100644
--- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java
+++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/VectorIndexScanComparisons.java
@@ -54,7 +54,7 @@
 * {@link ScanComparisons} for use in a multidimensional index scan.
 */
 @API(API.Status.UNSTABLE)
-public class VectorIndexScanComparisons implements IndexScanParameters {
+public final class VectorIndexScanComparisons implements IndexScanParameters {
 @Nonnull
 private final ScanComparisons prefixScanComparisons;
 @Nonnull
@@ -261,9 +261,9 @@ public IndexScanParameters translateCorrelations(@Nonnull final TranslationMap t
 }
 @Nonnull
- protected VectorIndexScanComparisons withComparisonsAndOptions(@Nonnull final ScanComparisons prefixScanComparisons,
- @Nonnull final DistanceRankValueComparison distanceRankValueComparison,
- @Nonnull final VectorIndexScanOptions vectorIndexScanOptions) {
+ VectorIndexScanComparisons withComparisonsAndOptions(@Nonnull final ScanComparisons prefixScanComparisons,
+ @Nonnull final DistanceRankValueComparison distanceRankValueComparison,
+ @Nonnull final VectorIndexScanOptions vectorIndexScanOptions) {
 return new VectorIndexScanComparisons(prefixScanComparisons, distanceRankValueComparison, vectorIndexScanOptions);
 }

From 44c943c7b98a3e01c948e7764dfce6075d1a142d Mon Sep 17 00:00:00 2001
From: Normen Seemann
Date: Wed, 17 Dec 2025 14:32:02 +0100
Subject: [PATCH 17/17] addressing more comments

---
 .../async/hnsw/DeleteNeighborsChangeSet.java |  6 +++++-
 .../apple/foundationdb/async/hnsw/HNSW.java  | 18 +++++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java
index 0bdd1eb3dd..012ea9e54d 100644
--- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java
+++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/DeleteNeighborsChangeSet.java
@@ -85,7 +85,11 @@ public NeighborsChangeSet getParent() {
 @Override
 public boolean hasChanges() {
- // We can probably do better by testing if the deletion has an effect on the merge.
+ //
+ // We can probably do better by testing if the deletion has an effect on the merge, i.e. if the neighbors that
+ // are being deleted by this set are in fact part of the underlying set. That case is currently impossible so
+ // we just return true for now.
+ //
 return true;
 }
diff --git a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java
index 7ab65072f5..15fd1fa02f 100644
--- a/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java
+++ b/fdb-extensions/src/main/java/com/apple/foundationdb/async/hnsw/HNSW.java
@@ -455,7 +455,7 @@ private CompletableFuture greedySearchInliningLayer(@
 nodeFactory.create(candidateReference.getPrimaryKey(),
 baseCompactNode.getVector(), candidateNode.getNeighbors()));
 })
- .thenApply(ignored -> null);
+ .thenApply(ignored -> null); // keep Java happy about the return type
 })
 .thenApply(candidateNode -> {
@@ -2456,10 +2456,26 @@ static void scanLayer(@Nonnull final Config config,
 : new CompactStorageAdapter(config, CompactNode.factory(), subspace, onWriteListener, onReadListener);
 }
+ /**
+ * Returns a good double hash code for the argument of type {@code long}. It uses {@link #splitMixLong(long)}
+ * internally and then maps the {@code long} result to a {@code double} between {@code 0} and {@code 1}.
+ * This method is directly used in {@link #topLayer(Tuple)} to determine the top layer of a record given its
+ * primary key.
+ * @param x a {@code long}
+ * @return a high-quality hash code of {@code x} as a {@code double} in the range {@code [0.0d, 1.0d)}.
+ */
 private static double splitMixDouble(final long x) {
 return (splitMixLong(x) >>> 11) * 0x1.0p-53;
 }
+ /**
+ * Returns a good long hash code for the argument of type {@code long}. It is an implementation of the
+ * output mixing function {@code SplitMix64} as employed by many PRNGs such as {@link SplittableRandom}.
+ * See Linear congruential generator for
+ * more information.
+ * @param x a {@code long}
+ * @return a high-quality hash code of {@code x}
+ */
 private static long splitMixLong(long x) {
 x += 0x9e3779b97f4a7c15L;
 x = (x ^ (x >>> 30)) * 0xbf58476d1ce4e5b9L;
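For illustration, here is a self-contained sketch of the two helpers above together with one common way a uniform double in [0.0, 1.0) is turned into an HNSW top layer. The last two mixing steps use the published SplitMix64 constants, and the geometric layer-assignment formula is the textbook one from the HNSW paper; whether topLayer(Tuple) uses exactly this formula is an assumption, since its body is not shown here.

import java.util.concurrent.ThreadLocalRandom;

// Standalone sketch -- mirrors splitMixDouble/splitMixLong and adds a hypothetical layer assignment.
public final class TopLayerSketch {

    // SplitMix64 output mixer; the first two steps match the constants visible above,
    // the remaining steps are the published SplitMix64 ones.
    private static long splitMixLong(long x) {
        x += 0x9e3779b97f4a7c15L;
        x = (x ^ (x >>> 30)) * 0xbf58476d1ce4e5b9L;
        x = (x ^ (x >>> 27)) * 0x94d049bb133111ebL;
        return x ^ (x >>> 31);
    }

    // Keep the top 53 bits of the mix and scale them into [0.0, 1.0).
    private static double splitMixDouble(final long x) {
        return (splitMixLong(x) >>> 11) * 0x1.0p-53;
    }

    // Hypothetical layer assignment: floor(-ln(1 - u) / ln(m)) makes higher layers
    // exponentially less likely, which is the standard HNSW geometric distribution.
    static int topLayer(final long primaryKeyHash, final int m) {
        final double u = splitMixDouble(primaryKeyHash);
        return (int) Math.floor(-Math.log(1.0d - u) / Math.log(m)); // 1 - u avoids log(0)
    }

    public static void main(final String[] args) {
        final long hash = ThreadLocalRandom.current().nextLong();
        System.out.println("u = " + splitMixDouble(hash));
        System.out.println("top layer (m = 16): " + topLayer(hash, 16));
    }
}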