From 07aa566f5d86c4a9a71c85a3cd93184c4e63b62a Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach
Date: Thu, 5 Dec 2024 13:24:15 +0100
Subject: [PATCH] Fix the cost estimate of SORT, s.t. the pubchem queries get the correct query plans.

Signed-off-by: Johannes Kalmbach
---
 src/engine/Sort.cpp            | 47 ++++++++++++++++++++++++++++++++++
 src/engine/Sort.h              | 12 +--------
 src/global/RuntimeParameters.h |  4 ++-
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/src/engine/Sort.cpp b/src/engine/Sort.cpp
index f66a8c8bc9..b14c01565a 100644
--- a/src/engine/Sort.cpp
+++ b/src/engine/Sort.cpp
@@ -9,6 +9,8 @@
 
 #include "engine/CallFixedSize.h"
 #include "engine/Engine.h"
+#include "engine/Filter.h"
+#include "engine/IndexScan.h"
 #include "engine/QueryExecutionTree.h"
 #include "global/RuntimeParameters.h"
 
@@ -74,3 +76,48 @@ ProtoResult Sort::computeResult([[maybe_unused]] bool requestLaziness) {
   LOG(DEBUG) << "Sort result computation done." << endl;
   return {std::move(idTable), resultSortedOn(), subRes->getSharedLocalVocab()};
 }
+
+// _____________________________________________________________________________
+size_t Sort::getCostEstimate() {
+  size_t size = getSizeEstimateBeforeLimit();
+  size_t logSize =
+      size < 4 ? 2 : static_cast<size_t>(logb(static_cast<double>(size)));
+  size_t nlogn = size * logSize;
+  size_t subcost = subtree_->getCostEstimate();
+  // Return at least 1, s.t. the query planner will never emit an unnecessary
+  // sort of an empty `IndexScan`. This makes the testing of the query
+  // planner much easier.
+
+  // Don't return plain `n log n` but also incorporate the number of columns and
+  // a constant multiplicator for the inherent complexity of sorting.
+  auto result = std::max(1UL, 20 * getResultWidth() * (nlogn + subcost));
+
+  // Determine if the subtree is a FILTER of an INDEX SCAN. This case can be
+  // useful if the FILTER can be applied via binary search and the result is
+  // then so small that the SORT doesn't hurt anymore. But in case the FILTER
+  // doesn't filter out much, and the result size is beyond a configurable
+  // threshold, we want to heavily discourage the plan with the binary filter +
+  // sorting, because it breaks the lazy evaluation.
+  auto sizeEstimateOfFilteredScan = [&]() -> size_t {
+    if (auto filter =
+            dynamic_cast<const Filter*>(subtree_->getRootOperation().get())) {
+      if (dynamic_cast<const IndexScan*>(
+              filter->getSubtree()->getRootOperation().get())) {
+        return subtree_->getSizeEstimate();
+      }
+    }
+    return 0;
+  }();
+  size_t maxSizeFilteredScan =
+      RuntimeParameters()
+          .get<"max-materialization-size-filtered-scan">()
+          .getBytes() /
+      sizeof(Id) / subtree_->getResultWidth();
+  if (sizeEstimateOfFilteredScan > maxSizeFilteredScan) {
+    // If the filtered result is larger than the defined threshold, make the
+    // cost estimate much larger, s.t. the query planner will prefer a plan
+    // without the `SORT`.
+    result *= 10'000;
+  }
+  return result;
+}
diff --git a/src/engine/Sort.h b/src/engine/Sort.h
index d94a69c199..80dcc97143 100644
--- a/src/engine/Sort.h
+++ b/src/engine/Sort.h
@@ -44,17 +44,7 @@ class Sort : public Operation {
 
   std::shared_ptr<QueryExecutionTree> getSubtree() const { return subtree_; }
 
-  virtual size_t getCostEstimate() override {
-    size_t size = getSizeEstimateBeforeLimit();
-    size_t logSize =
-        size < 4 ? 2 : static_cast<size_t>(logb(static_cast<double>(size)));
-    size_t nlogn = size * logSize;
-    size_t subcost = subtree_->getCostEstimate();
-    // Return at least 1, s.t. the query planner will never emit an unnecessary
-    // sort of an empty `IndexScan`. This makes the testing of the query
-    // planner much easier.
-    return std::max(1UL, nlogn + subcost);
-  }
+  size_t getCostEstimate() override;
 
   virtual bool knownEmptyResult() override {
     return subtree_->knownEmptyResult();
diff --git a/src/global/RuntimeParameters.h b/src/global/RuntimeParameters.h
index 8e60725ffe..c2e52a2bf9 100644
--- a/src/global/RuntimeParameters.h
+++ b/src/global/RuntimeParameters.h
@@ -53,7 +53,9 @@ inline auto& RuntimeParameters() {
         Bool<"throw-on-unbound-variables">{false},
         // Control up until which size lazy results should be cached. Caching
         // does cause significant overhead for this case.
-        MemorySizeParameter<"lazy-result-max-cache-size">{5_MB}};
+        MemorySizeParameter<"lazy-result-max-cache-size">{5_MB},
+        MemorySizeParameter<"max-materialization-size-filtered-scan">{100_MB},
+    };
   }();
   return params;
 }