From 8a362a0359dc09a7a9be1748f37b8cdc42b365fd Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 5 Dec 2024 18:28:00 +0100 Subject: [PATCH 1/4] Also make joins between two index scans individually observable. Signed-off-by: Johannes Kalmbach --- src/engine/IndexScan.cpp | 78 +++++++++++++++++++++++++++++-- src/engine/IndexScan.h | 15 +++++- src/engine/Join.cpp | 67 +++++++++----------------- src/engine/Join.h | 9 ++-- src/engine/Operation.h | 6 +++ src/engine/QueryExecutionTree.cpp | 15 ++++++ src/engine/QueryExecutionTree.h | 4 ++ src/index/CompressedRelation.h | 2 +- 8 files changed, 140 insertions(+), 56 deletions(-) diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index f56123a42b..54bb1a2dd0 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -204,15 +204,25 @@ std::shared_ptr IndexScan::makeCopyWithAddedPrefilters( } // _____________________________________________________________________________ -Result::Generator IndexScan::chunkedIndexScan() const { +Result::Generator IndexScan::chunkedIndexScan() { auto optBlockSpan = getBlockMetadata(); if (!optBlockSpan.has_value()) { co_return; } const auto& blockSpan = optBlockSpan.value(); + size_t numBlocksAll = blockSpan.size(); // Note: Given a `PrefilterIndexPair` is available, the corresponding // prefiltering will be applied in `getLazyScan`. - for (IdTable& idTable : getLazyScan({blockSpan.begin(), blockSpan.end()})) { + auto innerGenerator = getLazyScan({blockSpan.begin(), blockSpan.end()}); + auto setDetails = ad_utility::makeOnDestructionDontThrowDuringStackUnwinding( + [ + + this, numBlocksAll, &innerGenerator]() { + auto details = innerGenerator.details(); + details.numBlocksAll_ = numBlocksAll; + updateRuntimeInfoForLazyScan(details); + }); + for (IdTable& idTable : innerGenerator) { co_yield {std::move(idTable), LocalVocab{}}; } } @@ -339,7 +349,14 @@ IndexScan::getBlockMetadata() const { // _____________________________________________________________________________ std::optional> IndexScan::getBlockMetadataOptionallyPrefiltered() const { - auto optBlockSpan = getBlockMetadata(); + auto optBlockSpan = + [&]() -> std::optional> { + if (prefilteredBlocks_.has_value()) { + return prefilteredBlocks_.value(); + } else { + return getBlockMetadata(); + } + }(); std::optional> optBlocks = std::nullopt; if (optBlockSpan.has_value()) { const auto& blockSpan = optBlockSpan.value(); @@ -389,6 +406,7 @@ std::optional IndexScan::getMetadataForScan() }; // _____________________________________________________________________________ +// TODO This can be removed now. std::array IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) { AD_CONTRACT_CHECK(s1.numVariables_ <= 3 && s2.numVariables_ <= 3); @@ -656,3 +674,57 @@ std::pair IndexScan::prefilterTables( return {createPrefilteredJoinSide(state), createPrefilteredIndexScanSide(state)}; } + +// _____________________________________________________________________________ +void IndexScan::setBlocksForJoinOfIndexScans(Operation* left, + Operation* right) { + auto& leftScan = dynamic_cast(*left); + auto& rightScan = dynamic_cast(*right); + + auto getBlocks = [](IndexScan& scan) { + auto metaBlocks = scan.getMetadataForScan(); + if (!metaBlocks.has_value()) { + return metaBlocks; + } + if (scan.prefilteredBlocks_.has_value()) { + metaBlocks.value().blockMetadata_ = scan.prefilteredBlocks_.value(); + } + return metaBlocks; + }; + + auto metaBlocks1 = getBlocks(leftScan); + auto metaBlocks2 = getBlocks(rightScan); + if (!metaBlocks1.has_value() || !metaBlocks2.has_value()) { + return; + } + auto [blocks1, blocks2] = CompressedRelationReader::getBlocksForJoin( + metaBlocks1.value(), metaBlocks2.value()); + leftScan.prefilteredBlocks_ = std::move(blocks1); + rightScan.prefilteredBlocks_ = std::move(blocks2); +} + +// _____________________________________________________________________________ +std::vector IndexScan::getIndexScansForSortVariables( + std::span variables) { + const auto& sorted = resultSortedOn(); + if (resultSortedOn().size() < variables.size()) { + return {}; + } + const auto& varColMap = getExternallyVisibleVariableColumns(); + for (size_t i = 0; i < variables.size(); ++i) { + auto it = varColMap.find(variables[i]); + if (it == varColMap.end() || + it->second.columnIndex_ != resultSortedOn().at(i)) { + return {}; + } + } + return {this}; +} + +// _____________________________________________________________________________ +void IndexScan::setPrefilteredBlocks( + std::vector prefilteredBlocks) { + prefilteredBlocks_ = std::move(prefilteredBlocks); + // TODO once the other PR is merged we have to assert that the result + // is never cached AD_CORRECTNESS_CHECK(!canBeStoredInCache()); +} diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index c10680f59e..fa19037e76 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -36,6 +36,9 @@ class IndexScan final : public Operation { std::vector additionalColumns_; std::vector additionalVariables_; + // TODO Comment + std::optional> prefilteredBlocks_; + public: IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation, const SparqlTriple& triple, Graphs graphsToFilter = std::nullopt, @@ -108,6 +111,9 @@ class IndexScan final : public Operation { std::pair prefilterTables( Result::Generator input, ColumnIndex joinColumn); + // TODO Comment + static void setBlocksForJoinOfIndexScans(Operation* left, Operation* right); + private: // Implementation detail that allows to consume a generator from two other // cooperating generators. Needs to be forward declared as it is used by @@ -202,7 +208,7 @@ class IndexScan final : public Operation { PrefilterIndexPair prefilter) const; // Return the (lazy) `IdTable` for this `IndexScan` in chunks. - Result::Generator chunkedIndexScan() const; + Result::Generator chunkedIndexScan(); // Get the `IdTable` for this `IndexScan` in one piece. IdTable materializedIndexScan() const; @@ -234,4 +240,11 @@ class IndexScan final : public Operation { Permutation::IdTableGenerator getLazyScan( std::vector blocks) const; std::optional getMetadataForScan() const; + + // TODO Comment. + void setPrefilteredBlocks( + std::vector prefilteredBlocks); + + std::vector getIndexScansForSortVariables( + std::span variables) override; }; diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp index d3c5370e16..9212b6d7d6 100644 --- a/src/engine/Join.cpp +++ b/src/engine/Join.cpp @@ -181,6 +181,16 @@ ProtoResult Join::computeResult(bool requestLaziness) { auto rightResIfCached = getCachedOrSmallResult(*_right); checkCancellation(); + // TODO Copy and move to separate function. + std::span joinVarSpan{&_joinVar, 1}; + auto leftIndexScans = _left->getIndexScansForSortVariables(joinVarSpan); + auto rightIndexScans = _right->getIndexScansForSortVariables(joinVarSpan); + for (auto* left : leftIndexScans) { + for (auto* right : rightIndexScans) { + IndexScan::setBlocksForJoinOfIndexScans(left, right); + } + } + auto leftIndexScan = std::dynamic_pointer_cast(_left->getRootOperation()); if (leftIndexScan && @@ -189,9 +199,6 @@ ProtoResult Join::computeResult(bool requestLaziness) { AD_CORRECTNESS_CHECK(rightResIfCached->isFullyMaterialized()); return computeResultForIndexScanAndIdTable( requestLaziness, std::move(rightResIfCached), leftIndexScan); - - } else if (!leftResIfCached) { - return computeResultForTwoIndexScans(requestLaziness); } } @@ -216,9 +223,10 @@ ProtoResult Join::computeResult(bool requestLaziness) { if (leftRes->isFullyMaterialized()) { return computeResultForIndexScanAndIdTable( requestLaziness, std::move(leftRes), rightIndexScan); + } else if (!leftIndexScan) { + return computeResultForIndexScanAndLazyOperation( + requestLaziness, std::move(leftRes), rightIndexScan); } - return computeResultForIndexScanAndLazyOperation( - requestLaziness, std::move(leftRes), rightIndexScan); } std::shared_ptr rightRes = @@ -647,47 +655,6 @@ void Join::addCombinedRowToIdTable(const ROW_A& rowA, const ROW_B& rowB, } } -// ______________________________________________________________________________________________________ -ProtoResult Join::computeResultForTwoIndexScans(bool requestLaziness) const { - return createResult( - requestLaziness, - [this](std::function yieldTable) { - auto leftScan = - std::dynamic_pointer_cast(_left->getRootOperation()); - auto rightScan = - std::dynamic_pointer_cast(_right->getRootOperation()); - AD_CORRECTNESS_CHECK(leftScan && rightScan); - // The join column already is the first column in both inputs, so we - // don't have to permute the inputs and results for the - // `AddCombinedRowToIdTable` class to work correctly. - AD_CORRECTNESS_CHECK(_leftJoinCol == 0 && _rightJoinCol == 0); - auto rowAdder = makeRowAdder(std::move(yieldTable)); - - ad_utility::Timer timer{ - ad_utility::timer::Timer::InitialStatus::Started}; - auto [leftBlocksInternal, rightBlocksInternal] = - IndexScan::lazyScanForJoinOfTwoScans(*leftScan, *rightScan); - runtimeInfo().addDetail("time-for-filtering-blocks", timer.msecs()); - - auto leftBlocks = convertGenerator(std::move(leftBlocksInternal)); - auto rightBlocks = convertGenerator(std::move(rightBlocksInternal)); - - ad_utility::zipperJoinForBlocksWithoutUndef(leftBlocks, rightBlocks, - std::less{}, rowAdder); - - leftScan->updateRuntimeInfoForLazyScan(leftBlocks.details()); - rightScan->updateRuntimeInfoForLazyScan(rightBlocks.details()); - - AD_CORRECTNESS_CHECK(leftBlocks.details().numBlocksRead_ <= - rightBlocks.details().numElementsRead_); - AD_CORRECTNESS_CHECK(rightBlocks.details().numBlocksRead_ <= - leftBlocks.details().numElementsRead_); - auto localVocab = std::move(rowAdder.localVocab()); - return Result::IdTableVocabPair{std::move(rowAdder).resultTable(), - std::move(localVocab)}; - }); -} - // ______________________________________________________________________________________________________ template ProtoResult Join::computeResultForIndexScanAndIdTable( @@ -834,3 +801,11 @@ ad_utility::AddCombinedRowToIdTable Join::makeRowAdder( 1, IdTable{getResultWidth(), allocator()}, cancellationHandle_, CHUNK_SIZE, std::move(callback)}; } +// _____________________________________________________________________________ +std::vector Join::getIndexScansForSortVariables( + std::span variables) { + auto result = _left->getIndexScansForSortVariables(variables); + auto right = _right->getIndexScansForSortVariables(variables); + result.insert(result.end(), right.begin(), right.end()); + return result; +} diff --git a/src/engine/Join.h b/src/engine/Join.h index 8c8978c8d3..823177cfd8 100644 --- a/src/engine/Join.h +++ b/src/engine/Join.h @@ -141,6 +141,10 @@ class Join : public Operation { static void hashJoin(const IdTable& dynA, ColumnIndex jc1, const IdTable& dynB, ColumnIndex jc2, IdTable* dynRes); + // TODO Comment. + std::vector getIndexScansForSortVariables( + std::span variables) override; + protected: virtual string getCacheKeyImpl() const override; @@ -149,11 +153,6 @@ class Join : public Operation { VariableToColumnMap computeVariableToColumnMap() const override; - // A special implementation that is called when both children are - // `IndexScan`s. Uses the lazy scans to only retrieve the subset of the - // `IndexScan`s that is actually needed without fully materializing them. - ProtoResult computeResultForTwoIndexScans(bool requestLaziness) const; - // A special implementation that is called when exactly one of the children is // an `IndexScan` and the other one is a fully materialized result. The // argument `idTableIsRightInput` determines whether the `IndexScan` is the diff --git a/src/engine/Operation.h b/src/engine/Operation.h index 3e06a9498e..6cebffc0b5 100644 --- a/src/engine/Operation.h +++ b/src/engine/Operation.h @@ -426,6 +426,12 @@ class Operation { RuntimeInformation::Status status = RuntimeInformation::Status::optimizedOut); + // TODO Comment. + virtual std::vector getIndexScansForSortVariables( + [[maybe_unused]] std::span variables) { + return {}; + } + private: // Create the runtime information in case the evaluation of this operation has // failed. diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index c9496fe958..784729fb67 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -225,3 +225,18 @@ QueryExecutionTree::getVariableAndInfoByColumnIndex(ColumnIndex colIdx) const { AD_CONTRACT_CHECK(it != varColMap.end()); return *it; } + +// _____________________________________________________________________________ +std::vector QueryExecutionTree::getIndexScansForSortVariables( + std::span variables) { + auto result = rootOperation_->getIndexScansForSortVariables(variables); + if (result.empty()) { + return result; + } + // TODO We have to disable the caching as soon as the PR for that is + // merged. + // rootOperation_->disableCaching(); + cachedResult_.reset(); + sizeEstimate_.reset(); + return result; +} diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 0eac785f16..b93a6b1a1f 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -122,6 +122,10 @@ class QueryExecutionTree { return rootOperation_->collectWarnings(); } + // TODO Comment. + virtual std::vector getIndexScansForSortVariables( + std::span variables) final; + template void forAllDescendants(F f) { static_assert( diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 36cbf14af0..7f20527ad9 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -466,7 +466,7 @@ class CompressedRelationReader { // to be performed. struct ScanSpecAndBlocks { ScanSpecification scanSpec_; - const std::span blockMetadata_; + std::span blockMetadata_; }; // This struct additionally contains the first and last triple of the scan From 90383885a67df9087ec6ff91e3ed1647130ce80b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 17 Dec 2024 11:38:28 +0100 Subject: [PATCH 2/4] Some improvements. Signed-off-by: Johannes Kalmbach --- src/engine/IndexScan.cpp | 27 +++++++++++++++++++-------- src/engine/IndexScan.h | 4 +++- src/engine/Join.cpp | 31 +++++++++++++++++++++++-------- src/engine/Join.h | 5 +++-- src/engine/Operation.h | 12 +++++++++--- src/engine/QueryExecutionTree.cpp | 16 +++++++++++++--- src/engine/QueryExecutionTree.h | 19 ++++++++++++++++--- 7 files changed, 86 insertions(+), 28 deletions(-) diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 54bb1a2dd0..84c7e52713 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -704,21 +704,32 @@ void IndexScan::setBlocksForJoinOfIndexScans(Operation* left, } // _____________________________________________________________________________ -std::vector IndexScan::getIndexScansForSortVariables( - std::span variables) { +bool IndexScan::hasIndexScansForJoinPrefiltering( + std::span joinVariables) const { const auto& sorted = resultSortedOn(); - if (resultSortedOn().size() < variables.size()) { - return {}; + if (resultSortedOn().size() < joinVariables.size()) { + return false; } const auto& varColMap = getExternallyVisibleVariableColumns(); - for (size_t i = 0; i < variables.size(); ++i) { - auto it = varColMap.find(variables[i]); + for (size_t i = 0; i < joinVariables.size(); ++i) { + auto it = varColMap.find(joinVariables[i]); if (it == varColMap.end() || it->second.columnIndex_ != resultSortedOn().at(i)) { - return {}; + return false; } } - return {this}; + return true; +} + +// _____________________________________________________________________________ +std::vector +IndexScan::getIndexScansForJoinPrefilteringAndDisableCaching( + std::span variables) { + if (hasIndexScansForJoinPrefiltering(variables)) { + return {this}; + } else { + return {}; + } } // _____________________________________________________________________________ diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index fa19037e76..5e67f0788c 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -245,6 +245,8 @@ class IndexScan final : public Operation { void setPrefilteredBlocks( std::vector prefilteredBlocks); - std::vector getIndexScansForSortVariables( + bool hasIndexScansForJoinPrefiltering( + std::span joinVariables) const override; + std::vector getIndexScansForJoinPrefilteringAndDisableCaching( std::span variables) override; }; diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp index 9212b6d7d6..e6516e9f53 100644 --- a/src/engine/Join.cpp +++ b/src/engine/Join.cpp @@ -183,11 +183,16 @@ ProtoResult Join::computeResult(bool requestLaziness) { // TODO Copy and move to separate function. std::span joinVarSpan{&_joinVar, 1}; - auto leftIndexScans = _left->getIndexScansForSortVariables(joinVarSpan); - auto rightIndexScans = _right->getIndexScansForSortVariables(joinVarSpan); - for (auto* left : leftIndexScans) { - for (auto* right : rightIndexScans) { - IndexScan::setBlocksForJoinOfIndexScans(left, right); + if (_left->hasIndexScansForJoinPrefiltering(joinVarSpan) && + _right->hasIndexScansForJoinPrefiltering(joinVarSpan)) { + auto leftIndexScans = + _left->getIndexScansForJoinPrefilteringAndDisableCaching(joinVarSpan); + auto rightIndexScans = + _right->getIndexScansForJoinPrefilteringAndDisableCaching(joinVarSpan); + for (auto* left : leftIndexScans) { + for (auto* right : rightIndexScans) { + IndexScan::setBlocksForJoinOfIndexScans(left, right); + } } } @@ -801,11 +806,21 @@ ad_utility::AddCombinedRowToIdTable Join::makeRowAdder( 1, IdTable{getResultWidth(), allocator()}, cancellationHandle_, CHUNK_SIZE, std::move(callback)}; } + +// _____________________________________________________________________________ +bool Join::hasIndexScansForJoinPrefiltering( + std::span variables) const { + return _left->hasIndexScansForJoinPrefiltering(variables) || + _right->hasIndexScansForJoinPrefiltering(variables); +} + // _____________________________________________________________________________ -std::vector Join::getIndexScansForSortVariables( +std::vector Join::getIndexScansForJoinPrefilteringAndDisableCaching( std::span variables) { - auto result = _left->getIndexScansForSortVariables(variables); - auto right = _right->getIndexScansForSortVariables(variables); + auto result = + _left->getIndexScansForJoinPrefilteringAndDisableCaching(variables); + auto right = + _right->getIndexScansForJoinPrefilteringAndDisableCaching(variables); result.insert(result.end(), right.begin(), right.end()); return result; } diff --git a/src/engine/Join.h b/src/engine/Join.h index 823177cfd8..1502cae819 100644 --- a/src/engine/Join.h +++ b/src/engine/Join.h @@ -141,8 +141,9 @@ class Join : public Operation { static void hashJoin(const IdTable& dynA, ColumnIndex jc1, const IdTable& dynB, ColumnIndex jc2, IdTable* dynRes); - // TODO Comment. - std::vector getIndexScansForSortVariables( + bool hasIndexScansForJoinPrefiltering( + std::span variables) const override; + std::vector getIndexScansForJoinPrefilteringAndDisableCaching( std::span variables) override; protected: diff --git a/src/engine/Operation.h b/src/engine/Operation.h index 6cebffc0b5..f1321bc1c5 100644 --- a/src/engine/Operation.h +++ b/src/engine/Operation.h @@ -426,9 +426,15 @@ class Operation { RuntimeInformation::Status status = RuntimeInformation::Status::optimizedOut); - // TODO Comment. - virtual std::vector getIndexScansForSortVariables( - [[maybe_unused]] std::span variables) { + // See the functions with the same name in `QueryExecutionTree.h` for + // documentation. + virtual bool hasIndexScansForJoinPrefiltering( + [[maybe_unused]] std::span joinVariables) const { + return false; + } + virtual std::vector + getIndexScansForJoinPrefilteringAndDisableCaching( + [[maybe_unused]] std::span joinVariables) { return {}; } diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index 784729fb67..6cd0a84038 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -13,6 +13,7 @@ #include "engine/Sort.h" #include "parser/RdfEscaping.h" +#include "util/http/UrlParser.h" using std::string; @@ -227,9 +228,18 @@ QueryExecutionTree::getVariableAndInfoByColumnIndex(ColumnIndex colIdx) const { } // _____________________________________________________________________________ -std::vector QueryExecutionTree::getIndexScansForSortVariables( - std::span variables) { - auto result = rootOperation_->getIndexScansForSortVariables(variables); +bool QueryExecutionTree::hasIndexScansForJoinPrefiltering( + std::span joinVariables) const { + return rootOperation_->hasIndexScansForJoinPrefiltering(joinVariables); +} + +// _____________________________________________________________________________ +std::vector +QueryExecutionTree::getIndexScansForJoinPrefilteringAndDisableCaching( + std::span joinVariables) { + auto result = + rootOperation_->getIndexScansForJoinPrefilteringAndDisableCaching( + joinVariables); if (result.empty()) { return result; } diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index b93a6b1a1f..739c684b65 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -122,9 +122,22 @@ class QueryExecutionTree { return rootOperation_->collectWarnings(); } - // TODO Comment. - virtual std::vector getIndexScansForSortVariables( - std::span variables) final; + // The following functions are used if the ExecutionTree is the child or a + // descendant of a JOIN operation. They look for IndexScans in the subtree + // that are eligible for the block prefiltering of joins because + // 1. They are sorted by the `joinVariables` + // 2. They are semantically eligible for the join prefiltering (this doesn't + // hold for example for the right hand side of a MINUS) + + // This function returns true iff at least on eligible `IndexScan` is + // contained In the subtree. + bool hasIndexScansForJoinPrefiltering( + std::span joinVariables) const; + + // This function returns all eligible `IndexScan`s and disables caching for + // them, because they will probably be modified afterward. + std::vector getIndexScansForJoinPrefilteringAndDisableCaching( + std::span joinVariables); template void forAllDescendants(F f) { From d4ea7063fefab758e823020fb568bbfa3fc1e5b7 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 18 Dec 2024 17:52:15 +0100 Subject: [PATCH 3/4] Caching bugs in the test. Signed-off-by: Johannes Kalmbach --- test/engine/IndexScanTest.cpp | 38 +++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/test/engine/IndexScanTest.cpp b/test/engine/IndexScanTest.cpp index 62f01647a0..b067e1b512 100644 --- a/test/engine/IndexScanTest.cpp +++ b/test/engine/IndexScanTest.cpp @@ -25,6 +25,13 @@ using LazyResult = Result::LazyResult; using IndexPair = std::pair; +// TODO Comment +Permutation::IdTableGenerator convertGenerator(Result::LazyResult gen) { + for (auto& [idTable, localVocab] : gen) { + co_yield idTable; + } +} + // NOTE: All the following helper functions always use the `PSO` permutation to // set up index scans unless explicitly stated otherwise. @@ -104,20 +111,29 @@ void testLazyScanForJoinOfTwoScans( std::vector limits{{}, {12, 3}, {2, 3}}; for (const auto& limit : limits) { auto qec = getQec(kgTurtle, true, true, true, blocksizePermutations); + qec->getQueryTreeCache().clearAll(); IndexScan s1{qec, Permutation::PSO, tripleLeft}; + IndexScan s1Copy{qec, Permutation::PSO, tripleLeft}; s1.setLimit(limit); IndexScan s2{qec, Permutation::PSO, tripleRight}; - auto implForSwitch = [](IndexScan& l, IndexScan& r, const auto& expectedL, - const auto& expectedR, - const LimitOffsetClause& limitL, - const LimitOffsetClause& limitR) { - auto [scan1, scan2] = (IndexScan::lazyScanForJoinOfTwoScans(l, r)); - - testLazyScan(std::move(scan1), l, expectedL, limitL); - testLazyScan(std::move(scan2), r, expectedR, limitR); - }; - implForSwitch(s1, s2, leftRows, rightRows, limit, {}); - implForSwitch(s2, s1, rightRows, leftRows, {}, limit); + IndexScan s2Copy{qec, Permutation::PSO, tripleLeft}; + + IndexScan::setBlocksForJoinOfIndexScans(&s1, &s2); + + // TODO also switch the left and right inputs for the test + auto implForSwitch = + [](IndexScan& l, IndexScan& l2, IndexScan& r, IndexScan& r2, + const auto& expectedL, const auto& expectedR, + const LimitOffsetClause& limitL, const LimitOffsetClause& limitR) { + auto res1 = l.computeResultOnlyForTesting(true); + auto res2 = r.computeResultOnlyForTesting(true); + testLazyScan(convertGenerator(std::move(res1.idTables())), l2, + expectedL, limitL); + testLazyScan(convertGenerator(std::move(res2.idTables())), r2, + expectedR, limitR); + }; + implForSwitch(s1, s1Copy, s2, s2Copy, leftRows, rightRows, limit, {}); + implForSwitch(s2, s2Copy, s1, s1Copy, rightRows, leftRows, {}, limit); } } From 9ada194a1349716f62c9a417508c81fa106cb8f3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 19 Dec 2024 10:39:51 +0100 Subject: [PATCH 4/4] In the middle of fixing the tests... Signed-off-by: Johannes Kalmbach --- src/engine/IndexScan.cpp | 11 ++++++++-- test/engine/IndexScanTest.cpp | 41 ++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 279887c728..2b19356e0c 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -209,7 +209,7 @@ std::shared_ptr IndexScan::makeCopyWithAddedPrefilters( Result::Generator IndexScan::chunkedIndexScan() { auto optBlocks = [this]() -> std::optional> { - if (prefilteredBlocks_.has_value()) { + if (getLimit().isUnconstrained() && prefilteredBlocks_.has_value()) { return prefilteredBlocks_; } auto optAllBlocks = getBlockMetadata(); @@ -366,7 +366,8 @@ IndexScan::getBlockMetadataOptionallyPrefiltered() const { // The code after this is expensive because it always copies the complete // block metadata, so we do an early return of `nullopt` (which means "use // all the blocks") if no prefilter is specified. - if (!prefilter_.has_value() && !prefilteredBlocks_.has_value()) { + if ((!prefilter_.has_value() && !prefilteredBlocks_.has_value()) || + !getLimit().isUnconstrained()) { return std::nullopt; } @@ -721,7 +722,13 @@ void IndexScan::setBlocksForJoinOfIndexScans(Operation* left, auto metaBlocks1 = getBlocks(leftScan); auto metaBlocks2 = getBlocks(rightScan); + + // If one of the relations doesn't even exist and therefore has no blocks, + // we know that the join result is empty and can thus inform the other scan; + if (!metaBlocks1.has_value() || !metaBlocks2.has_value()) { + leftScan.prefilteredBlocks_.emplace(); + rightScan.prefilteredBlocks_.emplace(); return; } LOG(INFO) << "Original num blocks: " << metaBlocks1->blockMetadata_.size() diff --git a/test/engine/IndexScanTest.cpp b/test/engine/IndexScanTest.cpp index b067e1b512..7d366bd0dc 100644 --- a/test/engine/IndexScanTest.cpp +++ b/test/engine/IndexScanTest.cpp @@ -57,18 +57,11 @@ void testLazyScan(Permutation::IdTableGenerator partialLazyScanResult, ++numBlocks; } - if (limitOffset.isUnconstrained()) { - EXPECT_EQ(numBlocks, partialLazyScanResult.details().numBlocksRead_); - // The number of read elements might be a bit larger than the final result - // size, because the first and/or last block might be incomplete, meaning - // that they have to be completely read, but only partially contribute to - // the result. - EXPECT_LE(lazyScanRes.size(), - partialLazyScanResult.details().numElementsRead_); - } - auto resFullScan = fullScan.getResult()->idTable().clone(); IdTable expected{resFullScan.numColumns(), alloc}; + std::cout << "Result of full scan" << IdTable(resFullScan.clone()) + << std::endl; + std::cout << fullScan.getDescriptor() << std::endl; if (limitOffset.isUnconstrained()) { for (auto [lower, upper] : expectedRows) { @@ -84,12 +77,14 @@ void testLazyScan(Permutation::IdTableGenerator partialLazyScanResult, } if (limitOffset.isUnconstrained()) { - EXPECT_EQ(lazyScanRes, expected); + EXPECT_EQ(lazyScanRes, expected) << IdTable{resFullScan.clone()}; } else { // If the join on blocks could already determine that there are no matching // blocks, then the lazy scan will be empty even with a limit present. + // TODO Handle the limit EXPECT_TRUE((lazyScanRes.empty() && expectedRows.empty()) || - lazyScanRes == expected); + lazyScanRes == expected) + << "actual:" << lazyScanRes << "expected:" << expected; } } @@ -116,15 +111,25 @@ void testLazyScanForJoinOfTwoScans( IndexScan s1Copy{qec, Permutation::PSO, tripleLeft}; s1.setLimit(limit); IndexScan s2{qec, Permutation::PSO, tripleRight}; - IndexScan s2Copy{qec, Permutation::PSO, tripleLeft}; + IndexScan s2Copy{qec, Permutation::PSO, tripleRight}; IndexScan::setBlocksForJoinOfIndexScans(&s1, &s2); + s1.disableStoringInCache(); + s2.disableStoringInCache(); + + if (!limit.isUnconstrained()) { + std::cout << "has limit\n"; + } // TODO also switch the left and right inputs for the test auto implForSwitch = - [](IndexScan& l, IndexScan& l2, IndexScan& r, IndexScan& r2, - const auto& expectedL, const auto& expectedR, - const LimitOffsetClause& limitL, const LimitOffsetClause& limitR) { + [&qec](IndexScan& l, IndexScan& l2, IndexScan& r, IndexScan& r2, + const auto& expectedL, const auto& expectedR, + const LimitOffsetClause& limitL, const LimitOffsetClause& limitR, + ad_utility::source_location location = + ad_utility::source_location::current()) { + auto tr = generateLocationTrace(location); + qec->getQueryTreeCache().clearAll(); auto res1 = l.computeResultOnlyForTesting(true); auto res2 = r.computeResultOnlyForTesting(true); testLazyScan(convertGenerator(std::move(res1.idTables())), l2, @@ -235,6 +240,7 @@ const auto testSetAndMakeScanWithPrefilterExpr = TEST(IndexScan, lazyScanForJoinOfTwoScans) { SparqlTriple xpy{Tc{Var{"?x"}}, "

", Tc{Var{"?y"}}}; SparqlTriple xqz{Tc{Var{"?x"}}, "", Tc{Var{"?z"}}}; + /* { // In the tests we have a blocksize of two triples per block, and a new // block is started for a new relation. That explains the spacing of the @@ -259,8 +265,9 @@ TEST(IndexScan, lazyScanForJoinOfTwoScans) { // graph), so both lazy scans are empty. testLazyScanForJoinOfTwoScans(kg, xpy, xqz, {}, {}); } + */ { - // No triple for relation (which does appear in the knowledge graph, but + // No triple for relation (which does appear in the knowledge graph, but // not as a predicate), so both lazy scans are empty. std::string kg = "

.

. "