diff --git a/src/Access/AeolusAccessUtil.h b/src/Access/AeolusAccessUtil.h index 856baf98aff..c6ffb0144ef 100644 --- a/src/Access/AeolusAccessUtil.h +++ b/src/Access/AeolusAccessUtil.h @@ -7,7 +7,7 @@ namespace DB { -bool aeolusCheck(const Context & context, const String & full_table_name) +inline bool aeolusCheck(const Context & context, const String & full_table_name) { String access_table_names = context.getSettingsRef().access_table_names; diff --git a/src/Analyzers/SubstituteSelectItemToAnyFunction.cpp b/src/Analyzers/SubstituteSelectItemToAnyFunction.cpp index 4ff06e59cd5..0bcdc6714e7 100644 --- a/src/Analyzers/SubstituteSelectItemToAnyFunction.cpp +++ b/src/Analyzers/SubstituteSelectItemToAnyFunction.cpp @@ -149,7 +149,9 @@ void SubstituteSelectItemToAnyFunction::visit(ASTSelectQuery * select_query) // process having and order by if (!processed_identifier_qualified_names.empty()) { - SubstituteIdentifierToAnyFunction::Data expression_data{{}, processed_identifier_qualified_names, {}, context, false, false}; + QualifiedNames empty_qualified_names; + NameSet empty_aliases; + SubstituteIdentifierToAnyFunction::Data expression_data{empty_qualified_names, processed_identifier_qualified_names, empty_aliases, context, false, false}; SubstituteIdentifierToAnyFunction expression_visitor(expression_data); if (select_query->having()) expression_visitor.visit(select_query->refHaving()); diff --git a/src/Catalog/Catalog.cpp b/src/Catalog/Catalog.cpp index aca8e07d0fc..f517f9dd5e0 100644 --- a/src/Catalog/Catalog.cpp +++ b/src/Catalog/Catalog.cpp @@ -976,9 +976,13 @@ namespace Catalog } StoragePtr storage; - if (auto query_context = CurrentThread::getGroup()->query_context.lock()) - storage = tryGetTableByUUID(*query_context, UUIDHelpers::UUIDToString(uuid), TxnTimestamp::maxTS()); - else + auto thread_group = CurrentThread::getGroup(); + if (thread_group) + { + if (auto query_context = thread_group->query_context.lock()) + storage = tryGetTableByUUID(*query_context, 
UUIDHelpers::UUIDToString(uuid), TxnTimestamp::maxTS()); + } + if (!storage) storage = tryGetTableByUUID(context, UUIDHelpers::UUIDToString(uuid), TxnTimestamp::maxTS()); if (auto pcm = context.getPartCacheManager(); pcm && storage) @@ -4262,7 +4266,7 @@ namespace Catalog return; start_key.clear(); - auto it = meta_proxy->getAllTransactionRecord(name_space, start_key, max_result_number); + it = meta_proxy->getAllTransactionRecord(name_space, start_key, max_result_number); if (!it->next()) return; } diff --git a/src/CloudServices/CnchMergeMutateThread.cpp b/src/CloudServices/CnchMergeMutateThread.cpp index 87360a13d72..9bcb39ebc80 100644 --- a/src/CloudServices/CnchMergeMutateThread.cpp +++ b/src/CloudServices/CnchMergeMutateThread.cpp @@ -221,16 +221,22 @@ FutureManipulationTask::~FutureManipulationTask() /// Add source parts (include invisible parts) to merging_mutating_parts. FutureManipulationTask & FutureManipulationTask::tagSourceParts(ServerDataPartsVector && parts_) { - auto check_and_add = [&](const auto & part_name) { - if (parent.currently_merging_mutating_parts.count(part_name)) - throw Exception("Part '" + part_name + "' was already in other Task, cancel merge.", ErrorCodes::ABORTED); - parent.currently_merging_mutating_parts.emplace(part_name); - }; - if (!record->try_execute) { std::lock_guard lock(parent.currently_merging_mutating_parts_mutex); + std::vector added; + auto check_and_add = [&](const auto & part_name) { + if (parent.currently_merging_mutating_parts.count(part_name)) + { + for (const auto & n : added) + parent.currently_merging_mutating_parts.erase(n); + throw Exception("Part '" + part_name + "' was already in other Task, cancel merge.", ErrorCodes::ABORTED); + } + parent.currently_merging_mutating_parts.emplace(part_name); + added.push_back(part_name); + }; + for (const auto & p : parts_) { check_and_add(p->name()); diff --git a/src/CloudServices/CnchServerClient.cpp b/src/CloudServices/CnchServerClient.cpp index 
f5a5d4be66d..b21ff86e3c7 100644 --- a/src/CloudServices/CnchServerClient.cpp +++ b/src/CloudServices/CnchServerClient.cpp @@ -1034,7 +1034,10 @@ brpc::CallId CnchServerClient::submitPreloadTask(const MergeTreeMetaBase & stora auto * cntl = new brpc::Controller(); auto call_id = cntl->call_id(); if (parts.empty()) + { + delete cntl; return call_id; + } Protos::SubmitPreloadTaskReq request; request.set_ts(time(nullptr)); diff --git a/src/CloudServices/CnchServerServiceImpl.cpp b/src/CloudServices/CnchServerServiceImpl.cpp index 01dcec4391f..02c816e4e12 100644 --- a/src/CloudServices/CnchServerServiceImpl.cpp +++ b/src/CloudServices/CnchServerServiceImpl.cpp @@ -751,49 +751,17 @@ void CnchServerServiceImpl::fetchPartitions( ASTPtr query_ptr = deserializeAST(rb); /// We should to add `database` into AST before calling `buildSelectQueryInfoForQuery`. { - StoragePtr storage = gc->getCnchCatalog()->getTable(*gc, request->database(), request->table(), TxnTimestamp::maxTS()); - - auto calculated_host - = gc->getCnchTopologyMaster() - ->getTargetServer(UUIDHelpers::UUIDToString(storage->getStorageUUID()), storage->getServerVwName(), true) - .getRPCAddress(); - - if (request->remote_host() != calculated_host) - throw Exception( - "Fetch partitions failed because of inconsistent view of topology in remote server, remote_host: " - + request->remote_host() + ", calculated_host: " + calculated_host, - ErrorCodes::LOGICAL_ERROR); - - Names column_names; - for (const auto & name : request->column_name_filter()) - column_names.push_back(name); - auto session_context = Context::createCopy(gc); - session_context->setCurrentDatabase(request->database()); - ReadBufferFromString rb(request->predicate()); - ASTPtr query_ptr = deserializeAST(rb); - /// We should to add `database` into AST before calling `buildSelectQueryInfoForQuery`. 
- { - ASTSelectQuery * select_query = query_ptr->as(); - if (!select_query) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected AST type found in buildSelectQueryInfoForQuery"); - select_query->replaceDatabaseAndTable(request->database(), request->table()); - } - SelectQueryInfo query_info = buildSelectQueryInfoForQuery(query_ptr, session_context); - - session_context->setTemporaryTransaction( - TxnTimestamp(request->has_txnid() ? request->txnid() : session_context->getTimestamp()), 0, false); - auto required_partitions = gc->getCnchCatalog()->getPartitionsByPredicate( - session_context, storage, query_info, column_names, request->has_ignore_ttl() && request->ignore_ttl()); - - response->set_total_size(required_partitions.total_partition_number); - auto & mutable_partitions = *response->mutable_partitions(); - for (auto & partition : required_partitions.partitions) - *mutable_partitions.Add() = std::move(partition); + ASTSelectQuery * select_query = query_ptr->as(); + if (!select_query) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected AST type found in buildSelectQueryInfoForQuery"); + select_query->replaceDatabaseAndTable(request->database(), request->table()); } SelectQueryInfo query_info = buildSelectQueryInfoForQuery(query_ptr, session_context); - session_context->setTemporaryTransaction(TxnTimestamp(request->has_txnid() ? request->txnid() : session_context->getTimestamp()), 0, false); - auto required_partitions = gc->getCnchCatalog()->getPartitionsByPredicate(session_context, storage, query_info, column_names, request->has_ignore_ttl() && request->ignore_ttl()); + session_context->setTemporaryTransaction( + TxnTimestamp(request->has_txnid() ? 
request->txnid() : session_context->getTimestamp()), 0, false); + auto required_partitions = gc->getCnchCatalog()->getPartitionsByPredicate( + session_context, storage, query_info, column_names, request->has_ignore_ttl() && request->ignore_ttl()); response->set_total_size(required_partitions.total_partition_number); auto & mutable_partitions = *response->mutable_partitions(); diff --git a/src/CloudServices/CnchWorkerClient.cpp b/src/CloudServices/CnchWorkerClient.cpp index 67796bf8cad..7d911b7b54e 100644 --- a/src/CloudServices/CnchWorkerClient.cpp +++ b/src/CloudServices/CnchWorkerClient.cpp @@ -175,6 +175,60 @@ std::vector CnchWorkerClient::getManipulationTasksStatus() return res; } +std::vector CnchWorkerClient::getTTLCacheStats() +{ + brpc::Controller cntl; + Protos::GetTTLCacheStatsReq request; + Protos::GetTTLCacheStatsResp response; + + stub->getTTLCacheStats(&cntl, &request, &response, nullptr); + + assertController(cntl); + RPCHelpers::checkResponse(response); + + std::vector res; + res.reserve(response.tables_size()); + for (const auto & t : response.tables()) + res.push_back(t); + return res; +} + +std::vector CnchWorkerClient::getTTLCachePartitionStats() +{ + brpc::Controller cntl; + Protos::GetTTLCachePartitionStatsReq request; + Protos::GetTTLCachePartitionStatsResp response; + + stub->getTTLCachePartitionStats(&cntl, &request, &response, nullptr); + + assertController(cntl); + RPCHelpers::checkResponse(response); + + std::vector res; + res.reserve(response.partitions_size()); + for (const auto & p : response.partitions()) + res.push_back(p); + return res; +} + +std::vector CnchWorkerClient::getPreloadStats() +{ + brpc::Controller cntl; + Protos::GetPreloadStatsReq request; + Protos::GetPreloadStatsResp response; + + stub->getPreloadStats(&cntl, &request, &response, nullptr); + + assertController(cntl); + RPCHelpers::checkResponse(response); + + std::vector res; + res.reserve(response.partitions_size()); + for (const auto & p : response.partitions()) 
+ res.push_back(p); + return res; +} + void CnchWorkerClient::submitMvRefreshTask( const StorageMaterializedView & , const ManipulationTaskParams & params, TxnTimestamp txn_id) { diff --git a/src/CloudServices/CnchWorkerClient.h b/src/CloudServices/CnchWorkerClient.h index 8aa8e4a6f6c..bf19a0dc82d 100644 --- a/src/CloudServices/CnchWorkerClient.h +++ b/src/CloudServices/CnchWorkerClient.h @@ -43,6 +43,9 @@ namespace DB namespace Protos { class CnchWorkerService_Stub; + class TTLCacheTableStats; + class TTLCachePartitionStats; + class PreloadPartitionStats; } namespace IngestColumnCnch @@ -81,6 +84,9 @@ class CnchWorkerClient : public RpcClientBase void shutdownManipulationTasks(const UUID & table_uuid, const Strings & task_ids = Strings{}); std::unordered_set touchManipulationTasks(const UUID & table_uuid, const Strings & tasks_id); std::vector getManipulationTasksStatus(); + std::vector getTTLCacheStats(); + std::vector getTTLCachePartitionStats(); + std::vector getPreloadStats(); void submitMvRefreshTask( const StorageMaterializedView & storage, const ManipulationTaskParams & params, TxnTimestamp txn_id); diff --git a/src/CloudServices/CnchWorkerServiceImpl.cpp b/src/CloudServices/CnchWorkerServiceImpl.cpp index 53cb069af0c..4e05d4750bc 100644 --- a/src/CloudServices/CnchWorkerServiceImpl.cpp +++ b/src/CloudServices/CnchWorkerServiceImpl.cpp @@ -34,6 +34,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -605,10 +608,23 @@ void CnchWorkerServiceImpl::preloadDataParts( } else { + // Group parts by partition and register with PreloadRegistry before scheduling + // so in-flight counts are visible immediately. 
+ auto & registry = PreloadRegistry::instance(); + String table_name = cloud_merge_tree.getStorageID().getFullNameNotQuoted(); + String table_uuid_str = toString(cloud_merge_tree.getStorageUUID()); + std::unordered_map partition_counts; + for (const auto & part : data_parts) + partition_counts[part->info.partition_id]++; + for (const auto & [pid, cnt] : partition_counts) + registry.registerParts(table_name, table_uuid_str, pid, cnt, preload_level); + ThreadPool * preload_thread_pool = &(IDiskCache::getPreloadPool()); for (const auto & part : data_parts) { - preload_thread_pool->scheduleOrThrowOnError([part, preload_level, submit_ts, read_injection, storage] { + String pid = part->info.partition_id; + preload_thread_pool->scheduleOrThrowOnError([part, preload_level, submit_ts, read_injection, storage, table_uuid_str, pid, ®istry] { + SCOPE_EXIT({ registry.partFinished(table_uuid_str, pid); }); part->remote_fs_read_failed_injection = read_injection; part->disk_cache_mode = DiskCacheMode::SKIP_DISK_CACHE;// avoid getCheckum & getIndex re-cache part->preload(preload_level, submit_ts); @@ -1317,6 +1333,104 @@ void CnchWorkerServiceImpl::getCloudMergeTreeStatus( { } +void CnchWorkerServiceImpl::getTTLCacheStats( + google::protobuf::RpcController *, + const Protos::GetTTLCacheStatsReq *, + Protos::GetTTLCacheStatsResp * response, + google::protobuf::Closure * done) +{ + SUBMIT_THREADPOOL({ + auto ttl_caches = DiskCacheFactory::instance().getAllTableTTLCaches(); + LOG_INFO(log, "getTTLCacheStats: {} TTL cache(s) in registry", ttl_caches.size()); + for (const auto & [uuid, cache_ptr] : ttl_caches) + { + auto * ttl_cache = dynamic_cast(cache_ptr.get()); + if (!ttl_cache) + continue; + + auto stats = ttl_cache->getStats(); + LOG_INFO(log, "getTTLCacheStats: returning stats for table={} uuid={}", ttl_cache->getName(), stats.table_uuid); + auto * t = response->add_tables(); + t->set_table_name(ttl_cache->getName()); + t->set_table_uuid(stats.table_uuid); + 
t->set_ttl_minutes(ttl_cache->getTTLMinutes()); + t->set_max_size_bytes(ttl_cache->getMaxSizeBytes()); + t->set_last_eviction_run(stats.last_eviction_run); + t->set_evicted_expired(stats.evicted_expired); + t->set_evicted_size_limit(stats.evicted_size_limit); + t->set_async_triggered_evicted(stats.async_eviction_triggered); + t->set_async_skipped_rate_limit_evicted(stats.async_eviction_skipped_rate_limit); + t->set_rejected_non_time_partition(stats.rejected_non_time_partition); + t->set_rejected_too_old(stats.rejected_too_old); + t->set_count_preload(stats.cached_from_preload); + t->set_count_query(stats.cached_from_query); + t->set_bytes_preload(stats.cached_bytes_preload); + t->set_bytes_query(stats.cached_bytes_query); + t->set_count_restored(stats.cached_from_restored); + t->set_bytes_restored(stats.cached_bytes_restored); + t->set_idx_count_preload(stats.cached_idx_from_preload); + t->set_idx_bytes_preload(stats.cached_idx_bytes_preload); + t->set_idx_count_query(stats.cached_idx_from_query); + t->set_idx_bytes_query(stats.cached_idx_bytes_query); + t->set_data_hits(stats.data_hits); + t->set_data_misses(stats.data_misses); + t->set_idx_hits(stats.idx_hits); + t->set_idx_misses(stats.idx_misses); + } + }) +} + +void CnchWorkerServiceImpl::getTTLCachePartitionStats( + google::protobuf::RpcController *, + const Protos::GetTTLCachePartitionStatsReq *, + Protos::GetTTLCachePartitionStatsResp * response, + google::protobuf::Closure * done) +{ + SUBMIT_THREADPOOL({ + auto ttl_caches = DiskCacheFactory::instance().getAllTableTTLCaches(); + LOG_DEBUG(log, "getTTLCachePartitionStats: {} TTL cache(s) in registry", ttl_caches.size()); + for (const auto & [uuid, cache_ptr] : ttl_caches) + { + auto * ttl_cache = dynamic_cast(cache_ptr.get()); + if (!ttl_cache) + continue; + + auto table_stats = ttl_cache->getStats(); + LOG_DEBUG(log, "getTTLCachePartitionStats: returning partition stats for table={} uuid={}", ttl_cache->getName(), table_stats.table_uuid); + for (const auto 
& ps : ttl_cache->getPartitionStats()) + { + auto * p = response->add_partitions(); + p->set_table_name(ttl_cache->getName()); + p->set_table_uuid(table_stats.table_uuid); + p->set_partition(ps.partition_id); + p->set_entry_count(ps.entry_count); + p->set_bytes(ps.total_bytes); + } + } + }) +} + +void CnchWorkerServiceImpl::getPreloadStats( + google::protobuf::RpcController *, + const Protos::GetPreloadStatsReq *, + Protos::GetPreloadStatsResp * response, + google::protobuf::Closure * done) +{ + SUBMIT_THREADPOOL({ + for (const auto & snap : PreloadRegistry::instance().getSnapshot()) + { + auto * p = response->add_partitions(); + p->set_table_name(snap.table_name); + p->set_table_uuid(snap.table_uuid); + p->set_partition_id(snap.partition_id); + p->set_parts_in_flight(snap.parts_in_flight); + p->set_parts_submitted(snap.parts_submitted); + p->set_elapsed_ms(snap.elapsed_ms); + p->set_preload_level(snap.preload_level); + } + }) +} + #if defined(__clang__) # pragma clang diagnostic pop #else diff --git a/src/CloudServices/CnchWorkerServiceImpl.h b/src/CloudServices/CnchWorkerServiceImpl.h index 49578d86bca..fc5d169a529 100644 --- a/src/CloudServices/CnchWorkerServiceImpl.h +++ b/src/CloudServices/CnchWorkerServiceImpl.h @@ -67,6 +67,24 @@ class CnchWorkerServiceImpl : protected WithMutableContext, public DB::Protos::C Protos::GetManipulationTasksStatusResp * response, google::protobuf::Closure * done) override; + void getTTLCacheStats( + google::protobuf::RpcController * cntl, + const Protos::GetTTLCacheStatsReq * request, + Protos::GetTTLCacheStatsResp * response, + google::protobuf::Closure * done) override; + + void getTTLCachePartitionStats( + google::protobuf::RpcController * cntl, + const Protos::GetTTLCachePartitionStatsReq * request, + Protos::GetTTLCachePartitionStatsResp * response, + google::protobuf::Closure * done) override; + + void getPreloadStats( + google::protobuf::RpcController * cntl, + const Protos::GetPreloadStatsReq * request, + 
Protos::GetPreloadStatsResp * response, + google::protobuf::Closure * done) override; + void GetPreallocatedStatus( google::protobuf::RpcController *, const Protos::GetPreallocatedStatusReq * request, diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index ba91347891b..e88ec334052 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -106,6 +106,10 @@ M(CreatedReadBufferMMapFailed, "") \ M(DiskReadElapsedMicroseconds, "Total time spent waiting for read syscall. This include reads from page cache.") \ M(DiskWriteElapsedMicroseconds, "Total time spent waiting for write syscall. This include writes to page cache.") \ + M(DiskCacheDecompressMicroseconds, "Time spent decompressing blocks on disk cache hit path.") \ + M(DiskCacheDiskReadMicroseconds, "Time spent in pread/read syscall on disk cache hit path (excludes decompression).") \ + M(DiskCacheUncompressedHit, "UncompressedCache hit: no disk I/O or decompression needed.") \ + M(DiskCacheUncompressedMiss, "UncompressedCache miss: disk read + decompression performed.") \ M(NetworkReceiveElapsedMicroseconds, \ "Total time spent waiting for data to receive or receiving data from network. 
Only ClickHouse-related network interaction is " \ "included, not by 3rd party libraries.") \ @@ -444,6 +448,10 @@ M(DiskCacheGetTotalOps, "Total count of disk cache get operations") \ M(DiskCacheSetTotalOps, "Total count of disk cache set operations") \ M(DiskCacheSetTotalBytes, "Total of disk cache set operations") \ + M(DiskCacheDataHits, "TTL cache hits for data column segments") \ + M(DiskCacheDataMisses, "TTL cache misses for data column segments") \ + M(DiskCacheIdxHits, "TTL cache hits for skip-index segments") \ + M(DiskCacheIdxMisses, "TTL cache misses for skip-index segments") \ M(DiskCacheDeviceBytesWritten, "Total bytes written of disk cache device") \ M(DiskCacheDeviceBytesRead, "Total bytes read of disk cache device") \ M(DiskCacheDeviceWriteIOErrors, "Total errors of disk cache device write io") \ diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp index 56a976be25a..0c63c95a00c 100644 --- a/src/Compression/CachedCompressedReadBuffer.cpp +++ b/src/Compression/CachedCompressedReadBuffer.cpp @@ -26,6 +26,19 @@ #include "IO/BufferWithOwnMemory.h" #include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event DiskCacheDecompressMicroseconds; + extern const Event DiskCacheDiskReadMicroseconds; + extern const Event DiskCacheUncompressedHit; + extern const Event DiskCacheUncompressedMiss; +} + +static Poco::Logger * getLog() { return &Poco::Logger::get("CachedCompressedReadBuffer"); } namespace DB @@ -69,9 +82,13 @@ bool CachedCompressedReadBuffer::nextImpl() /// Let's check for the presence of a decompressed block in the cache, grab the ownership of this block, if it exists. 
UInt128 key = cache->hash(path, file_pos); + bool cache_miss = false; owned_cell = cache->getOrSet(key, [&]() { + cache_miss = true; initInput(); + + Stopwatch io_sw; file_in->seek(file_pos, SEEK_SET); auto cell = std::make_shared(); @@ -79,21 +96,33 @@ bool CachedCompressedReadBuffer::nextImpl() size_t size_decompressed; size_t size_compressed_without_checksum; cell->compressed_size = readCompressedData(size_decompressed, size_compressed_without_checksum, false); + const auto io_us = io_sw.elapsedMicroseconds(); + ProfileEvents::increment(ProfileEvents::DiskCacheDiskReadMicroseconds, io_us); if (cell->compressed_size) { - // * a little bit hack here for reducing memory copy - // * allocate 12 more bytes to store {size_decompressed} and {size_decompressed}, padding at the end of the data cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); auto buffer = HybridCache::Buffer{size_decompressed + cell->additional_bytes + sizeof(cell->compressed_size) + sizeof(cell->additional_bytes)}; cell->data = std::move(buffer); cell->data.shrink(size_decompressed + cell->additional_bytes); + + Stopwatch decompress_sw; decompressTo(reinterpret_cast(cell->data.data()), size_decompressed, size_compressed_without_checksum); + const auto decompress_us = decompress_sw.elapsedMicroseconds(); + ProfileEvents::increment(ProfileEvents::DiskCacheDecompressMicroseconds, decompress_us); + + LOG_DEBUG(getLog(), "[cache-perf] path={} compressed={}B decompressed={}B disk_read={}us decompress={}us", + path, cell->compressed_size, size_decompressed, io_us, decompress_us); } return cell; }); + if (cache_miss) + ProfileEvents::increment(ProfileEvents::DiskCacheUncompressedMiss); + else + ProfileEvents::increment(ProfileEvents::DiskCacheUncompressedHit); + if (owned_cell->data.size() == 0) return false; diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index c3e4a3185fe..4ebefd9af92 100644 --- 
a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -26,7 +26,17 @@ #include #include #include +#include +#include +#include +namespace ProfileEvents +{ + extern const Event DiskCacheDiskReadMicroseconds; + extern const Event DiskCacheDecompressMicroseconds; +} + +static Poco::Logger * getLog() { return &Poco::Logger::get("CompressedReadBufferFromFile"); } namespace DB { @@ -48,7 +58,11 @@ bool CompressedReadBufferFromFile::nextImpl() size_t size_decompressed = 0; size_t size_compressed_without_checksum; + Stopwatch io_sw; size_compressed = readCompressedData(size_decompressed, size_compressed_without_checksum, false); + const auto io_us = io_sw.elapsedMicroseconds(); + ProfileEvents::increment(ProfileEvents::DiskCacheDiskReadMicroseconds, io_us); + if (!size_compressed) return false; @@ -60,7 +74,13 @@ bool CompressedReadBufferFromFile::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); + Stopwatch decomp_sw; decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + const auto decomp_us = decomp_sw.elapsedMicroseconds(); + ProfileEvents::increment(ProfileEvents::DiskCacheDecompressMicroseconds, decomp_us); + + LOG_DEBUG(getLog(), "[cache-perf] path={} compressed={}B decompressed={}B disk_read={}us decompress={}us", + file_in.getFileName(), size_compressed, size_decompressed, io_us, decomp_us); /// nextimpl_working_buffer_offset is set in the seek function (lazy seek). So we have to /// check that we are not seeking beyond working buffer. 
@@ -163,7 +183,11 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) size_t size_decompressed = 0; size_t size_compressed_without_checksum = 0; + Stopwatch io_sw2; size_t new_size_compressed = readCompressedData(size_decompressed, size_compressed_without_checksum, false); + const auto io_us2 = io_sw2.elapsedMicroseconds(); + ProfileEvents::increment(ProfileEvents::DiskCacheDiskReadMicroseconds, io_us2); + size_compressed = 0; /// file_in no longer points to the end of the block in working_buffer. if (!new_size_compressed) return bytes_read; @@ -174,7 +198,14 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) /// need to skip some bytes in decompressed data (seek happened before readBig call). if (nextimpl_working_buffer_offset == 0 && size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { + Stopwatch decomp_sw2; decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum); + const auto decomp_us2 = decomp_sw2.elapsedMicroseconds(); + ProfileEvents::increment(ProfileEvents::DiskCacheDecompressMicroseconds, decomp_us2); + + LOG_DEBUG(getLog(), "[cache-perf] path={} compressed={}B decompressed={}B disk_read={}us decompress={}us", + file_in.getFileName(), new_size_compressed, size_decompressed, io_us2, decomp_us2); + bytes_read += size_decompressed; bytes += size_decompressed; } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7a25004c3ab..1931f26a736 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -220,6 +220,7 @@ enum PreloadLevelSettings : UInt64 M(UInt64, background_gc_schedule_pool_size, 16, "Number of threads performing data removing related background tasks.", 0) \ M(UInt64, local_disk_cache_thread_pool_size, 16, "Number of threads perforrming background tasks from cache segments from cloud storage to local disk. 
Only has meaning at server startup.", 0) \ M(UInt64, local_disk_cache_evict_thread_pool_size, 16, "Number of threads perforrming asynchronous remove disk cache file.", 0) \ + M(UInt64, local_disk_cache_preload_thread_pool_size, 16, "Number of threads for preloading parts into local disk cache. Only has meaning at server startup.", 0) \ M(UInt64, \ max_bandwidth_for_disk_cache, \ 0, \ diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index ea90a803dbd..0b543e65cf5 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ b/src/IO/S3/PocoHTTPClient.cpp @@ -136,7 +136,7 @@ void PocoHTTPClient::makeRequestInternal( Poco::Logger * log = &Poco::Logger::get("AWSClient"); auto uri = request.GetUri().GetURIString(); - LOG_DEBUG(log, "Make request to: {}", uri); + LOG_TRACE(log, "Make request to: {}", uri); enum class S3MetricType { @@ -296,7 +296,7 @@ void PocoHTTPClient::makeRequestInternal( request.GetContentBody()->seekg(0); auto size = Poco::StreamCopier::copyStream(*request.GetContentBody(), request_body_stream); - LOG_DEBUG(log, "Written {} bytes to request body", size); + LOG_TRACE(log, "Written {} bytes to request body", size); } LOG_TRACE(log, "Receiving response..."); @@ -306,14 +306,14 @@ void PocoHTTPClient::makeRequestInternal( ProfileEvents::increment(select_metric(S3MetricType::Microseconds), watch.elapsedMicroseconds()); int status_code = static_cast(poco_response.getStatus()); - LOG_DEBUG(log, "Response status: {}, {}", status_code, poco_response.getReason()); + LOG_TRACE(log, "Response status: {}, {}", status_code, poco_response.getReason()); if (poco_response.getStatus() == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) { auto location = poco_response.get("location"); remote_host_filter.checkURL(Poco::URI(location)); uri = location; - LOG_DEBUG(log, "Redirecting request to new location: {}", location); + LOG_TRACE(log, "Redirecting request to new location: {}", location); ProfileEvents::increment(select_metric(S3MetricType::Redirects)); @@ 
-328,7 +328,7 @@ void PocoHTTPClient::makeRequestInternal( response->AddHeader(header_name, header_value); headers_ss << header_name << ": " << header_value << "; "; } - LOG_DEBUG(log, "Received headers: {}", headers_ss.str()); + LOG_TRACE(log, "Received headers: {}", headers_ss.str()); if (status_code == 429 || status_code == 503) { // API throttling diff --git a/src/Interpreters/DistributedStages/PlanSegmentExecutor.cpp b/src/Interpreters/DistributedStages/PlanSegmentExecutor.cpp index 435dc34db81..a5c4d1cc802 100644 --- a/src/Interpreters/DistributedStages/PlanSegmentExecutor.cpp +++ b/src/Interpreters/DistributedStages/PlanSegmentExecutor.cpp @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -296,8 +298,7 @@ void fillPlanSegmentProfile( ContextPtr context, PlanSegment * plan_segment) { - AddressInfo current_address = getLocalAddress(*context); - segment_profile->worker_address = extractExchangeHostPort(current_address); + segment_profile->worker_address = getWorkerID(context); if (query_status) { auto query_status_info = query_status->getInfo(true, context->getSettingsRef().log_profile_events); @@ -324,18 +325,6 @@ void fillPlanSegmentProfile( auto step_profile = GroupedProcessorProfile::aggregateOperatorProfileToStepLevel(grouped_profiles); for (auto & [step_id, profile] : step_profile) segment_profile->profiles.emplace(step_id, profile); - auto & plan = plan_segment->getQueryPlan(); - for (auto & node : plan.getNodes()) - { - if (!node.step->getAttributeDescriptions().empty() && segment_profile->profiles.contains(node.id)) - { - for (auto & att : node.step->getAttributeDescriptions()) - { - auto attribute_ptr = std::make_shared(att.second); - segment_profile->profiles.at(node.id)->attributes.emplace(att.first, attribute_ptr); - } - } - } } } @@ -491,7 +480,7 @@ void PlanSegmentExecutor::doExecute() 
PlanSegmentDescription::getPlanSegmentDescription(plan_segment_instance->plan_segment, true) ->jsonPlanSegmentDescriptionAsString(collectStepRuntimeProfiles(pipeline))); } - if (context->getSettingsRef().report_segment_profiles && plan_segment) + if ((context->getSettingsRef().report_segment_profiles || context->getSettingsRef().log_segment_profiles) && plan_segment) { segment_profile = std::make_shared(query_log_element->client_info.initial_query_id, plan_segment->getPlanSegmentId()); fillPlanSegmentProfile( @@ -501,14 +490,41 @@ void PlanSegmentExecutor::doExecute() if (context->getSettingsRef().log_processors_profiles) { auto processors_profile_log = context->getProcessorsProfileLog(); + if (processors_profile_log) + processors_profile_log->addLogs(pipeline.get(), + context->getClientInfo().initial_query_id, + std::chrono::system_clock::now(), + plan_segment->getPlanSegmentId()); + } - if (!processors_profile_log) - return; - - processors_profile_log->addLogs(pipeline.get(), - context->getClientInfo().initial_query_id, - std::chrono::system_clock::now(), - plan_segment->getPlanSegmentId()); + // Collect post-execution attributes (e.g. CacheStats from TTL disk cache) into + // attribute_descriptions on TableScanStep, then propagate all attribute_descriptions + // from every plan node into the segment profile. 
+ if (segment_profile && plan_segment) + { + auto & plan = plan_segment->getQueryPlan(); + for (auto & node : plan.getNodes()) + { + if (auto * ts = dynamic_cast(node.step.get())) + { + LOG_DEBUG(logger, "Collecting post-execution attributes for TableScanStep node {}", node.id); + ts->collectPostExecutionAttributes(); + } + auto & descs = node.step->getAttributeDescriptions(); + if (descs.empty()) + continue; + LOG_DEBUG(logger, "Propagating {} attribute(s) from node {} ({}) into segment profile", + descs.size(), node.id, node.step->getName()); + if (!segment_profile->profiles.contains(node.id)) + { + auto m = std::make_shared(); + m->id = node.id; + segment_profile->profiles.emplace(node.id, m); + } + for (auto & [k, v] : descs) + segment_profile->profiles.at(node.id)->attributes.insert_or_assign( + k, std::make_shared(v)); + } } } diff --git a/src/Interpreters/DistributedStages/PlanSegmentReport.cpp b/src/Interpreters/DistributedStages/PlanSegmentReport.cpp index 5e4ca6f3b50..e82a7996196 100644 --- a/src/Interpreters/DistributedStages/PlanSegmentReport.cpp +++ b/src/Interpreters/DistributedStages/PlanSegmentReport.cpp @@ -136,7 +136,7 @@ PlanSegmentExecutor::ExecutionResult convertSuccessPlanSegmentStatusToResult( result.runtime_segment_status.message = "execute success"; result.runtime_segment_status.metrics.final_progress = final_progress.toProto(); result.sender_metrics = senderMetricsToProto(plan_segment_outputs, sender_metrics, execution_address); - if (query_context->getSettingsRef().report_segment_profiles && segment_profile) + if ((query_context->getSettingsRef().report_segment_profiles || query_context->getSettingsRef().log_segment_profiles) && segment_profile) result.segment_profile = segment_profile; return result; diff --git a/src/Interpreters/SegmentScheduler.cpp b/src/Interpreters/SegmentScheduler.cpp index 5bf0a7e4f11..8f9026e1bd6 100644 --- a/src/Interpreters/SegmentScheduler.cpp +++ b/src/Interpreters/SegmentScheduler.cpp @@ -96,7 +96,8 @@ 
SegmentScheduler::insertPlanSegments(const String & query_id, PlanSegmentTree * } { - if (query_context->isExplainQuery() && query_context->getSettingsRef().report_segment_profiles) + if ((query_context->isExplainQuery() && query_context->getSettingsRef().report_segment_profiles) + || query_context->getSettingsRef().log_segment_profiles) { std::unique_lock lock(segment_profile_mutex); segment_profile_map[query_id]; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 7fd8db382f4..20503784c48 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -91,6 +91,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -794,6 +799,161 @@ void interpretSettings(ASTPtr query, ContextMutablePtr context) } } +static String buildRuntimeStatsJSON(const std::unordered_map & profiles_map) +{ + struct StageAgg + { + String type, name, condition, keys; + UInt64 parts_after = 0, granules_after = 0; + }; + struct StepAgg + { + UInt64 total_parts = 0; + std::vector stages; + }; + + bool has_cache = false; + UInt64 cache_hit_segs = 0, cache_miss_segs = 0, steal_segs = 0, s3_fallback_segs = 0; + UInt64 cache_bytes = 0, s3_bytes = 0, cache_read_ms = 0, s3_read_ms = 0; + UInt64 idx_hit_segs = 0, idx_miss_segs = 0, idx_cache_bytes = 0, idx_s3_bytes = 0; + UInt64 idx_cache_read_ms = 0, idx_s3_read_ms = 0; + std::map index_by_step; + + for (const auto & [seg_id, seg_profiles] : profiles_map) + { + for (const auto & profile : seg_profiles) + { + for (const auto & [step_id, metric] : profile->profiles) + { + if (metric->attributes.count(RuntimeAttributeKeys::CacheStats)) + { + try + { + Poco::JSON::Parser parser; + auto obj = parser.parse(metric->attributes.at(RuntimeAttributeKeys::CacheStats)->description) + .extract(); + has_cache = true; + cache_hit_segs += obj->getValue("cache_hit_segs"); + cache_miss_segs += obj->getValue("cache_miss_segs"); + steal_segs += 
obj->getValue("steal_segs"); + s3_fallback_segs += obj->getValue("s3_fallback_segs"); + cache_bytes += obj->getValue("cache_bytes"); + s3_bytes += obj->getValue("s3_bytes"); + cache_read_ms += obj->getValue("cache_read_ms"); + s3_read_ms += obj->getValue("s3_read_ms"); + idx_hit_segs += obj->getValue("idx_hit_segs"); + idx_miss_segs += obj->getValue("idx_miss_segs"); + idx_cache_bytes += obj->getValue("idx_cache_bytes"); + idx_s3_bytes += obj->getValue("idx_s3_bytes"); + idx_cache_read_ms += obj->getValue("idx_cache_read_ms"); + idx_s3_read_ms += obj->getValue("idx_s3_read_ms"); + } + catch (...) {} + } + + if (metric->attributes.count(RuntimeAttributeKeys::Indexes)) + { + const auto & additional = metric->attributes.at(RuntimeAttributeKeys::Indexes)->additional; + if (additional.empty()) + continue; + try + { + Poco::JSON::Parser parser; + auto obj = parser.parse(additional).extract(); + auto & step_agg = index_by_step[step_id]; + step_agg.total_parts += obj->getValue("total_parts"); + auto stages = obj->getArray("stages"); + if (step_agg.stages.empty()) + { + for (size_t i = 0; i < stages->size(); ++i) + { + auto s = stages->getObject(i); + StageAgg agg; + agg.type = s->getValue("type"); + if (s->has("name")) agg.name = s->getValue("name"); + if (s->has("condition")) agg.condition = s->getValue("condition"); + if (s->has("keys")) agg.keys = s->getValue("keys"); + agg.parts_after = s->getValue("parts_after"); + agg.granules_after = s->getValue("granules_after"); + step_agg.stages.push_back(std::move(agg)); + } + } + else + { + for (size_t i = 0; i < std::min(stages->size(), step_agg.stages.size()); ++i) + { + auto s = stages->getObject(i); + step_agg.stages[i].parts_after += s->getValue("parts_after"); + step_agg.stages[i].granules_after += s->getValue("granules_after"); + } + } + } + catch (...) 
{} + } + } + } + } + + if (!has_cache && index_by_step.empty()) + return ""; + + auto runtime_stats = std::make_unique(); + + if (has_cache) + { + auto cache_obj = std::make_unique(); + cache_obj->add("cache_hit_segs", cache_hit_segs); + cache_obj->add("cache_miss_segs", cache_miss_segs); + cache_obj->add("steal_segs", steal_segs); + cache_obj->add("s3_fallback_segs", s3_fallback_segs); + cache_obj->add("cache_bytes", cache_bytes); + cache_obj->add("s3_bytes", s3_bytes); + cache_obj->add("cache_read_ms", cache_read_ms); + cache_obj->add("s3_read_ms", s3_read_ms); + cache_obj->add("idx_hit_segs", idx_hit_segs); + cache_obj->add("idx_miss_segs", idx_miss_segs); + cache_obj->add("idx_cache_bytes", idx_cache_bytes); + cache_obj->add("idx_s3_bytes", idx_s3_bytes); + cache_obj->add("idx_cache_read_ms", idx_cache_read_ms); + cache_obj->add("idx_s3_read_ms", idx_s3_read_ms); + runtime_stats->add(RuntimeAttributeKeys::CacheStats, std::move(cache_obj)); + } + + if (!index_by_step.empty()) + { + auto idx_arr = std::make_unique(); + for (auto & [step_id, step_agg] : index_by_step) + { + auto step_obj = std::make_unique(); + step_obj->add("total_parts", step_agg.total_parts); + auto stages_arr = std::make_unique(); + for (const auto & stage : step_agg.stages) + { + auto s = std::make_unique(); + s->add("type", stage.type); + if (!stage.name.empty()) s->add("name", stage.name); + if (!stage.condition.empty()) s->add("condition", stage.condition); + if (!stage.keys.empty()) s->add("keys", stage.keys); + s->add("parts_after", stage.parts_after); + s->add("granules_after", stage.granules_after); + stages_arr->add(std::move(s)); + } + step_obj->add("stages", std::move(stages_arr)); + idx_arr->add(std::move(step_obj)); + } + runtime_stats->add("IndexUsage", std::move(idx_arr)); + } + + auto outer = std::make_unique(); + outer->add("RuntimeStats", std::move(runtime_stats)); + + WriteBufferFromOwnString buf; + JSONBuilder::FormatSettings json_fmt{.settings = {}}; + 
JSONBuilder::FormatContext fmt_ctx{.out = buf}; + outer->format(json_fmt, fmt_ctx); + return buf.str(); +} + static std::tuple executeQueryImpl( const char * begin, const char * end, @@ -1553,6 +1713,7 @@ static std::tuple executeQueryImpl( log_queries_min_type = settings.log_queries_min_type, log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), log_processors_profiles = settings.log_processors_profiles, + log_segment_profiles = settings.log_segment_profiles, status_info_to_query_log, query_id, finish_current_transaction]( @@ -1697,6 +1858,23 @@ static std::tuple executeQueryImpl( elem.used_table_functions = factories_info.table_functions; elem.partition_ids = context->getPartitionIds(); + if (log_segment_profiles) + { + if (auto scheduler = context->getSegmentScheduler()) + { + auto seg_profiles = scheduler->getSegmentsProfile(elem.client_info.current_query_id); + if (!seg_profiles.empty()) + { + auto runtime_stats = buildRuntimeStatsJSON(seg_profiles); + if (!runtime_stats.empty()) + { + if (!elem.segment_profiles) + elem.segment_profiles = std::make_shared>(); + elem.segment_profiles->emplace_back(std::move(runtime_stats)); + } + } + } + } if (log_queries && elem.type >= log_queries_min_type && Int64(elem.query_duration_ms) >= log_queries_min_query_duration_ms) logQuery(context, elem); diff --git a/src/MergeTreeCommon/MergeTreeMetaBase.cpp b/src/MergeTreeCommon/MergeTreeMetaBase.cpp index 51d33f22812..854e6f01519 100644 --- a/src/MergeTreeCommon/MergeTreeMetaBase.cpp +++ b/src/MergeTreeCommon/MergeTreeMetaBase.cpp @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -202,6 +203,13 @@ void MergeTreeMetaBase::setRelativeDataPath(StorageLocation location, const Stri relative_data_path = rel_path; } +IDiskCachePtr MergeTreeMetaBase::getDiskCache() const +{ + // Default implementation: return global LRU cache + // Override in StorageCloudMergeTree for per-table TTL cache support + return 
DiskCacheFactory::instance().get(DiskCacheType::MergeTree); +} + static void checkKeyExpression(const ExpressionActions & expr, const Block & sample_block, const String & key_name, bool allow_nullable_key) { for (const auto & action : expr.getActions()) diff --git a/src/MergeTreeCommon/MergeTreeMetaBase.h b/src/MergeTreeCommon/MergeTreeMetaBase.h index 106dbd295b4..fc7fa960203 100644 --- a/src/MergeTreeCommon/MergeTreeMetaBase.h +++ b/src/MergeTreeCommon/MergeTreeMetaBase.h @@ -37,6 +37,8 @@ namespace DB { class MutationCommands; +class IDiskCache; +using IDiskCachePtr = std::shared_ptr; class MergeTreeMetaBase : public IStorage, public WithMutableContext, public MergeTreeDataPartTypeHelper { @@ -171,6 +173,11 @@ class MergeTreeMetaBase : public IStorage, public WithMutableContext, public Mer virtual const String& getRelativeDataPath(StorageLocation location) const; void setRelativeDataPath(StorageLocation location, const String & rel_path); + /// Get disk cache (TTL cache if enabled, otherwise global LRU) + /// Override in subclasses to provide per-table TTL cache support + virtual IDiskCachePtr getDiskCache() const; + + bool supportsFinal() const override { return merging_params.mode == MergingParams::Collapsing diff --git a/src/Processors/Transforms/ExplainAnalyzeTransform.cpp b/src/Processors/Transforms/ExplainAnalyzeTransform.cpp index a0c35ee4e5f..c1c71afad2d 100644 --- a/src/Processors/Transforms/ExplainAnalyzeTransform.cpp +++ b/src/Processors/Transforms/ExplainAnalyzeTransform.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -52,6 +53,31 @@ void ExplainAnalyzeTransform::transform(Chunk & chunk) break; } + // Wait for segment profiles to arrive. Profiles are sent before status over separate RPCs, + // but server-side RPC thread scheduling can process status before profile, causing a race. 
+ if (context->getSettingsRef().report_segment_profiles || context->getSettingsRef().log_segment_profiles) + { + size_t expected_profiles = 0; + for (auto & desc : segment_descriptions) + if (desc->segment_id != 0) + expected_profiles += desc->parallel; + + auto profile_wait_start = std::chrono::steady_clock::now(); + while (expected_profiles > 0) + { + auto now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - profile_wait_start).count() >= 100) + break; + auto current_map = scheduler->getSegmentsProfile(context->getCurrentQueryId()); + size_t received = 0; + for (auto & [seg_id, seg_profiles] : current_map) + received += seg_profiles.size(); + if (received >= expected_profiles) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(2)); + } + } + auto profiles_map = scheduler->getSegmentsProfile(context->getCurrentQueryId()); String explain; if ((kind == ASTExplainQuery::ExplainKind::LogicalAnalyze || kind == ASTExplainQuery::ExplainKind::DistributedAnalyze)) diff --git a/src/Protos/cnch_worker_rpc.proto b/src/Protos/cnch_worker_rpc.proto index 71da188d569..138d4e8a19c 100644 --- a/src/Protos/cnch_worker_rpc.proto +++ b/src/Protos/cnch_worker_rpc.proto @@ -183,6 +183,85 @@ message GetManipulationTasksStatusResp repeated ManipulationTask tasks = 2; } +message TTLCacheTableStats +{ + optional string table_name = 1; + optional string table_uuid = 2; + optional uint64 ttl_minutes = 3; + optional uint64 max_size_bytes = 4; + optional uint64 last_eviction_run = 5; + optional uint64 evicted_expired = 6; + optional uint64 evicted_size_limit = 7; + optional uint64 async_triggered_evicted = 8; + optional uint64 async_skipped_rate_limit_evicted = 9; + optional uint64 rejected_non_time_partition = 12; + optional uint64 rejected_too_old = 13; + optional uint64 count_preload = 14; + optional uint64 count_query = 15; + optional uint64 bytes_preload = 16; + optional uint64 bytes_query = 17; + optional uint64 count_restored = 18; + optional 
uint64 bytes_restored = 19; + optional uint64 idx_count_preload = 20; + optional uint64 idx_bytes_preload = 21; + optional uint64 idx_count_query = 22; + optional uint64 idx_bytes_query = 23; + optional uint64 data_hits = 24; + optional uint64 data_misses = 25; + optional uint64 idx_hits = 26; + optional uint64 idx_misses = 27; +} + +message GetTTLCacheStatsReq +{ +} + +message GetTTLCacheStatsResp +{ + optional string exception = 1; + repeated TTLCacheTableStats tables = 2; +} + +message TTLCachePartitionStats +{ + optional string table_name = 1; + optional string table_uuid = 2; + optional string partition = 3; + optional uint64 entry_count = 4; + optional uint64 bytes = 5; +} + +message GetTTLCachePartitionStatsReq +{ +} + +message GetTTLCachePartitionStatsResp +{ + optional string exception = 1; + repeated TTLCachePartitionStats partitions = 2; +} + +message PreloadPartitionStats +{ + optional string table_name = 1; + optional string table_uuid = 2; + optional string partition_id = 3; + optional uint64 parts_in_flight = 4; + optional uint64 parts_submitted = 5; + optional uint64 elapsed_ms = 6; + optional uint64 preload_level = 7; +} + +message GetPreloadStatsReq +{ +} + +message GetPreloadStatsResp +{ + optional string exception = 1; + repeated PreloadPartitionStats partitions = 2; +} + message GetPreallocatedStatusReq { required UUID storage_id = 1; @@ -616,4 +695,8 @@ service CnchWorkerService rpc dropPartDiskCache(DropPartDiskCacheReq) returns (DropPartDiskCacheResp); rpc executeDedupTask(ExecuteDedupTaskReq) returns (ExecuteDedupTaskResp); + + rpc getTTLCacheStats(GetTTLCacheStatsReq) returns (GetTTLCacheStatsResp); + rpc getTTLCachePartitionStats(GetTTLCachePartitionStatsReq) returns (GetTTLCachePartitionStatsResp); + rpc getPreloadStats(GetPreloadStatsReq) returns (GetPreloadStatsResp); } diff --git a/src/QueryPlan/IQueryPlanStep.h b/src/QueryPlan/IQueryPlanStep.h index 1425adaafc0..e6015f60f95 100644 --- a/src/QueryPlan/IQueryPlanStep.h +++ 
b/src/QueryPlan/IQueryPlanStep.h @@ -130,6 +130,14 @@ struct RuntimeAttributeDescription void toProto(Protos::RuntimeAttributeDescription & proto) const; }; +namespace RuntimeAttributeKeys +{ + static constexpr const char * Indexes = "Indexes"; + static constexpr const char * SelectParts = "SelectParts"; + static constexpr const char * TableScanDescription = "TableScanDescription"; + static constexpr const char * CacheStats = "CacheStats"; +} + /// Single step of query plan. class IQueryPlanStep diff --git a/src/QueryPlan/PlanPrinter.cpp b/src/QueryPlan/PlanPrinter.cpp index 9bda33ccf4c..bcb8f260810 100644 --- a/src/QueryPlan/PlanPrinter.cpp +++ b/src/QueryPlan/PlanPrinter.cpp @@ -686,33 +686,40 @@ String PlanPrinter::TextPrinter::printAttributes(PlanNodeBase & plan, const Text size_t step_id = plan.getId(); if (!profiles.contains(step_id) || profiles.at(step_id)->address_to_attributes.empty()) return ""; - if (!settings.query_plan_options.indexes && !settings.selected_parts) + const auto & address_to_attributes = profiles.at(step_id)->address_to_attributes; + bool has_priority_attrs = std::any_of(address_to_attributes.begin(), address_to_attributes.end(), + [](const auto & p) { + return p.second.count(RuntimeAttributeKeys::CacheStats) + || p.second.count(RuntimeAttributeKeys::Indexes); + }); + if (!settings.query_plan_options.indexes && !settings.selected_parts && !has_priority_attrs) return ""; std::stringstream out; - const auto & address_to_attributes = profiles.at(step_id)->address_to_attributes; if (plan.getStep()->getType() == IQueryPlanStep::Type::TableScan) { - String space; for (const auto & [address, attribute] : address_to_attributes) { - if (address_to_attributes.size() > 1) - { - out << intent.detailIntent() << address; - space = " "; - } - if (settings.query_plan_options.indexes && attribute.contains("Indexes")) + String space = " "; + out << intent.detailIntent() << address; + if (settings.query_plan_options.indexes && 
attribute.contains(RuntimeAttributeKeys::Indexes)) { out << intent.detailIntent() << space << "Indexes:"; - auto index_desc = attribute.at("Indexes"); + auto index_desc = attribute.at(RuntimeAttributeKeys::Indexes); for (const auto & desc : index_desc->name_and_detail) out << intent.detailIntent() << space << " " << desc.second; } if (settings.selected_parts) { - if (attribute.contains("SelectParts")) - out << intent.detailIntent() << space << attribute.at("SelectParts")->description; - if (attribute.contains("TableScanDescription")) - out << intent.detailIntent() << space << attribute.at("TableScanDescription")->description; + if (attribute.contains(RuntimeAttributeKeys::SelectParts)) + out << intent.detailIntent() << space << attribute.at(RuntimeAttributeKeys::SelectParts)->description; + if (attribute.contains(RuntimeAttributeKeys::TableScanDescription)) + out << intent.detailIntent() << space << attribute.at(RuntimeAttributeKeys::TableScanDescription)->description; + } + if (attribute.contains(RuntimeAttributeKeys::CacheStats)) + { + out << intent.detailIntent() << space << "CacheStats:"; + for (const auto & desc : attribute.at(RuntimeAttributeKeys::CacheStats)->name_and_detail) + out << intent.detailIntent() << space << " " << desc.second; } } return out.str(); diff --git a/src/QueryPlan/ReadFromMergeTree.cpp b/src/QueryPlan/ReadFromMergeTree.cpp index cd3934c832d..ac51fc02ab1 100644 --- a/src/QueryPlan/ReadFromMergeTree.cpp +++ b/src/QueryPlan/ReadFromMergeTree.cpp @@ -32,7 +32,10 @@ #include #include #include +#include #include +#include +#include #include #include "Storages/MergeTree/MergeTreeIOSettings.h" #include @@ -46,6 +49,9 @@ namespace ProfileEvents extern const Event SelectedParts; extern const Event SelectedRanges; extern const Event SelectedMarks; + extern const Event IndexGranuleSeekTime; + extern const Event IndexGranuleReadTime; + extern const Event IndexGranuleCalcTime; } namespace DB @@ -172,7 +178,8 @@ static bool isSamePartition(const 
RangesInDataPart & lhs, const RangesInDataPart static bool canReadInPartitionOrder( const StorageInMemoryMetadata & metadata, const InputOrderInfo & input_order_info, - const ASTSelectQuery & select) + const ASTSelectQuery & select, + ContextPtr context) { if (!metadata.isPartitionKeyDefined() || !metadata.isSortingKeyDefined()) return false; @@ -191,8 +198,27 @@ static bool canReadInPartitionOrder( /// sorting columns should contain partition column auto partition_column_it = std::find(sorting_columns.begin(), sorting_columns.end(), partition_column); + + /// If partition_column is a MATERIALIZED alias (e.g. `date MATERIALIZED toDate(timestamp)`) + /// it won't appear directly in sorting columns. Expand it and retry. + ExpressionActionsPtr expanded_expr; if (partition_column_it == sorting_columns.end()) - return false; + { + auto col_default = metadata.getColumns().getDefault(partition_column); + if (!col_default || col_default->kind != ColumnDefaultKind::Materialized || !col_default->expression) + return false; + + auto mat_key = KeyDescription::getKeyFromAST(col_default->expression, metadata.getColumns(), context); + Names mat_required = mat_key.expression->getRequiredColumns(); + if (mat_required.size() != 1) + return false; + + partition_column_it = std::find(sorting_columns.begin(), sorting_columns.end(), mat_required[0]); + if (partition_column_it == sorting_columns.end()) + return false; + + expanded_expr = mat_key.expression; + } /// Allow table "partition by c order by (a, b, c)" for query "where a={} and b={} order by c", /// where all sorting columns before partition column match single value, @@ -227,9 +253,11 @@ static bool canReadInPartitionOrder( if (partition_key.column_names.front() == *partition_column_it) return true; - /// Allow "partition by func(x) order by (x)" where func is monotonic nondecreasing + /// Allow "partition by func(x) order by (x)" where func is monotonic nondecreasing. 
+ /// For MATERIALIZED columns use the expanded expression; otherwise use the partition key expression. + const ExpressionActions & expr_for_monotonicity = expanded_expr ? *expanded_expr : *partition_key.expression; IFunction::Monotonicity monotonicity; - for (const auto & action : partition_key.expression->getActions()) + for (const auto & action : expr_for_monotonicity.getActions()) { if (action.node->type != ActionsDAG::ActionType::FUNCTION) { @@ -1397,7 +1425,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const Build result.selected_marks, result.selected_ranges); - if (context->getSettingsRef().report_segment_profiles) + if (context->getSettingsRef().report_segment_profiles || context->getSettingsRef().log_segment_profiles) fillRuntimeAttributeDescriptions(result); ProfileEvents::increment(ProfileEvents::SelectedParts, result.selected_parts); @@ -1475,7 +1503,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const Build auto sorting_key_prefix_expr = ExpressionAnalyzer(order_key_prefix_ast, syntax_result, context).getActionsDAG(false); can_read_in_partition_order = (settings.optimize_read_in_partition_order || settings.force_read_in_partition_order) - && canReadInPartitionOrder(*metadata_for_reading, *input_order_info, query_info.query->as()); + && canReadInPartitionOrder(*metadata_for_reading, *input_order_info, query_info.query->as(), context); if (can_read_in_partition_order && result.selected_partitions > 1) { @@ -1759,54 +1787,160 @@ std::shared_ptr ReadFromMergeTree::copy(ContextPtr) const void ReadFromMergeTree::fillRuntimeAttributeDescriptions(const ReadFromMergeTree::AnalysisResult & result) { - auto index_stats = result.index_stats; - if (!result.index_stats.empty()) + const auto & index_stats = result.index_stats; + if (!index_stats.empty()) { RuntimeAttributeDescription index_desc; - for (size_t i = 0; i < index_stats.size(); ++i) + auto stages_array = std::make_unique(); + UInt64 prev_parts = 0; + 
UInt64 prev_granules = 0; + bool has_prev = false; + for (const auto & stat : index_stats) { - const auto & stat = index_stats[i]; if (stat.type == IndexType::None) continue; - std::stringstream out; - out << "Type: " << indexTypeToString(stat.type) << ";"; + String entry = fmt::format("Type: {};", indexTypeToString(stat.type)); if (!stat.name.empty()) - out << " Name: " << stat.name << ";"; + entry += fmt::format(" Name: {};", stat.name); if (!stat.description.empty()) - out << " Description: " << stat.description << ";"; + entry += fmt::format(" Description: {};", stat.description); if (!stat.used_keys.empty()) - { - String keys = fmt::format("{}", fmt::join(stat.used_keys, ",")); - out << " Keys: " << keys << ";"; - } + entry += fmt::format(" Keys: {};", fmt::join(stat.used_keys, ",")); if (!stat.condition.empty()) - out << " Condition: " << stat.condition << ";"; - out << " Parts: " << stat.num_parts_after; - if (i) - out << '/' << index_stats[i - 1].num_parts_after; - out << ";"; - out << " Granules: " << stat.num_granules_after; - if (i) - out << '/' << index_stats[i - 1].num_granules_after; - out << ";"; - index_desc.name_and_detail.emplace_back(indexTypeToString(stat.type), out.str()); + entry += fmt::format(" Condition: {};", stat.condition); + if (has_prev) + entry += fmt::format(" Parts: {}/{};", stat.num_parts_after, prev_parts); + else + entry += fmt::format(" Parts: {};", stat.num_parts_after); + if (has_prev) + entry += fmt::format(" Granules: {}/{};", stat.num_granules_after, prev_granules); + else + entry += fmt::format(" Granules: {};", stat.num_granules_after); + index_desc.name_and_detail.emplace_back(indexTypeToString(stat.type), std::move(entry)); + + auto stage = std::make_unique(); + stage->add("type", indexTypeToString(stat.type)); + if (!stat.name.empty()) + stage->add("name", stat.name); + if (!stat.condition.empty()) + stage->add("condition", stat.condition); + if (!stat.used_keys.empty()) + stage->add("keys", 
fmt::to_string(fmt::join(stat.used_keys, ","))); + stage->add("parts_after", stat.num_parts_after); + stage->add("granules_after", stat.num_granules_after); + stages_array->add(std::move(stage)); + + prev_parts = stat.num_parts_after; + prev_granules = stat.num_granules_after; + has_prev = true; } - index_desc.description = "Indexes"; - attribute_descriptions.emplace(index_desc.description, std::move(index_desc)); + + auto idx_json = std::make_unique(); + idx_json->add("total_parts", result.total_parts); + idx_json->add("stages", std::move(stages_array)); + WriteBufferFromOwnString idx_buf; + JSONBuilder::FormatSettings idx_fmt{.settings = {}}; + JSONBuilder::FormatContext idx_ctx{.out = idx_buf}; + idx_json->format(idx_fmt, idx_ctx); + index_desc.additional = idx_buf.str(); + + index_desc.description = RuntimeAttributeKeys::Indexes; + attribute_descriptions.insert_or_assign(RuntimeAttributeKeys::Indexes, std::move(index_desc)); } RuntimeAttributeDescription parts_desc; - String selected_parts_info = fmt::format( - "Selected {}/{} parts by partition key, {} parts by primary key, {}/{} marks by primary key, {} marks to read from {} ranges", + parts_desc.description = fmt::format( + "Selected {}/{} parts by partition key ({} partitions), {} parts by primary key, {}/{} marks by primary key, {} marks to read from {} ranges", result.parts_before_pk, result.total_parts, + result.selected_partitions, result.selected_parts, result.selected_marks_pk, result.total_marks_pk, result.selected_marks, result.selected_ranges); - parts_desc.description = selected_parts_info; - attribute_descriptions.emplace("SelectParts", std::move(parts_desc)); + attribute_descriptions.insert_or_assign(RuntimeAttributeKeys::SelectParts, std::move(parts_desc)); + +} + +void ReadFromMergeTree::collectCacheStats() +{ + auto query_id = CurrentThread::getQueryId().toString(); + LOG_DEBUG(log, "collectCacheStats: query_id={}", query_id); + if (query_id.empty()) + return; + auto cache_stats = 
DiskCacheFactory::instance().consumeQueryCacheStats(query_id); + if (!cache_stats) + { + LOG_DEBUG(log, "collectCacheStats: no stats found for query_id={}", query_id); + return; + } + LOG_DEBUG(log, "collectCacheStats: data hit={} miss={} steal={} s3_fallback={} cache_bytes={} s3_bytes={} idx hit={} miss={} idx_cache_bytes={} idx_s3_bytes={}", + cache_stats->cache_hit_segs, cache_stats->cache_miss_segs, + cache_stats->steal_segs, cache_stats->s3_fallback_segs, + cache_stats->cache_bytes, cache_stats->s3_bytes, + cache_stats->idx_hit_segs, cache_stats->idx_miss_segs, + cache_stats->idx_cache_bytes, cache_stats->idx_s3_bytes); + JSONBuilder::JSONMap cache_map; + cache_map.add("cache_hit_segs", cache_stats->cache_hit_segs); + cache_map.add("cache_miss_segs", cache_stats->cache_miss_segs); + cache_map.add("steal_segs", cache_stats->steal_segs); + cache_map.add("s3_fallback_segs", cache_stats->s3_fallback_segs); + cache_map.add("cache_bytes", cache_stats->cache_bytes); + cache_map.add("s3_bytes", cache_stats->s3_bytes); + cache_map.add("cache_read_ms", cache_stats->cache_read_ms); + cache_map.add("cache_read_ms_max", cache_stats->cache_read_ms_max); + cache_map.add("cache_read_ms_min", cache_stats->cache_read_ms_min); + cache_map.add("s3_read_ms", cache_stats->s3_read_ms); + cache_map.add("idx_hit_segs", cache_stats->idx_hit_segs); + cache_map.add("idx_miss_segs", cache_stats->idx_miss_segs); + cache_map.add("idx_cache_bytes", cache_stats->idx_cache_bytes); + cache_map.add("idx_s3_bytes", cache_stats->idx_s3_bytes); + cache_map.add("idx_cache_read_ms", cache_stats->idx_cache_read_ms); + cache_map.add("idx_s3_read_ms", cache_stats->idx_s3_read_ms); + WriteBufferFromOwnString buf; + JSONBuilder::FormatSettings json_fmt{.settings = {}}; + JSONBuilder::FormatContext fmt_ctx{.out = buf}; + cache_map.format(json_fmt, fmt_ctx); + RuntimeAttributeDescription cache_desc; + cache_desc.description = buf.str(); + uint64_t cache_wall_ms = cache_stats->reader_count > 0 + ? 
cache_stats->cache_read_ms / cache_stats->reader_count + : cache_stats->cache_read_ms; + uint64_t s3_wall_ms = cache_stats->reader_count > 0 + ? cache_stats->s3_read_ms / cache_stats->reader_count + : cache_stats->s3_read_ms; + cache_desc.name_and_detail.emplace_back("data", + fmt::format("data: hit={} miss={} steal={} s3={} cache={:.1f}MB ReadTime: {}ms[max={}ms, min={}ms] s3={:.1f}MB/{}ms", + cache_stats->cache_hit_segs, cache_stats->cache_miss_segs, + cache_stats->steal_segs, cache_stats->s3_fallback_segs, + cache_stats->cache_bytes / (1024.0 * 1024.0), + cache_wall_ms, cache_stats->cache_read_ms_max, cache_stats->cache_read_ms_min, + cache_stats->s3_bytes / (1024.0 * 1024.0), s3_wall_ms)); + uint64_t idx_s3_wall_ms = cache_stats->idx_reader_count > 0 + ? cache_stats->idx_s3_read_ms / cache_stats->idx_reader_count + : cache_stats->idx_s3_read_ms; + uint64_t idx_cache_wall_ms = cache_stats->idx_reader_count > 0 + ? cache_stats->idx_cache_read_ms / cache_stats->idx_reader_count + : cache_stats->idx_cache_read_ms; + cache_desc.name_and_detail.emplace_back("idx", + fmt::format("idx: hit={} miss={} cache={:.1f}MB/{}ms s3={:.1f}MB/{}ms", + cache_stats->idx_hit_segs, cache_stats->idx_miss_segs, + cache_stats->idx_cache_bytes / (1024.0 * 1024.0), idx_cache_wall_ms, + cache_stats->idx_s3_bytes / (1024.0 * 1024.0), idx_s3_wall_ms)); + + if (auto * tg = CurrentThread::getGroup().get()) + { + auto seek_us = tg->performance_counters[ProfileEvents::IndexGranuleSeekTime].load(); + auto read_us = tg->performance_counters[ProfileEvents::IndexGranuleReadTime].load(); + auto calc_us = tg->performance_counters[ProfileEvents::IndexGranuleCalcTime].load(); + if (seek_us > 0 || read_us > 0 || calc_us > 0) + cache_desc.name_and_detail.emplace_back("idx_eval", + fmt::format("idx_eval: seek={}ms read={}ms calc={}ms", + seek_us / 1000, read_us / 1000, calc_us / 1000)); + } + + attribute_descriptions.insert_or_assign(RuntimeAttributeKeys::CacheStats, std::move(cache_desc)); } bool 
MergeTreeDataSelectAnalysisResult::error() const diff --git a/src/QueryPlan/ReadFromMergeTree.h b/src/QueryPlan/ReadFromMergeTree.h index 947754eaf25..1265d5cef69 100644 --- a/src/QueryPlan/ReadFromMergeTree.h +++ b/src/QueryPlan/ReadFromMergeTree.h @@ -149,6 +149,7 @@ class ReadFromMergeTree final : public ISourceStep std::shared_ptr copy(ContextPtr ptr) const override; void fillRuntimeAttributeDescriptions(const ReadFromMergeTree::AnalysisResult & result); + void collectCacheStats(); StorageID getStorageID() const { return data.getStorageID(); } UInt64 getSelectedParts() const { return selected_parts; } diff --git a/src/QueryPlan/TableScanStep.cpp b/src/QueryPlan/TableScanStep.cpp index 1d164347395..39ed06fceec 100644 --- a/src/QueryPlan/TableScanStep.cpp +++ b/src/QueryPlan/TableScanStep.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1341,17 +1342,17 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer QueryPlanOptimizationSettings::fromContext(build_context.context), BuildQueryPipelineSettings::fromContext(build_context.context)); + for (auto & node : storage_plan.getNodes()) { - for (auto & node : storage_plan.getNodes()) + if (!read_step && dynamic_cast(node.step.get())) + read_step = node.step; + auto & att_descs = node.step->getAttributeDescriptions(); + if (att_descs.empty()) + continue; + for (auto & desc : att_descs) { - auto & att_descs = node.step->getAttributeDescriptions(); - if (att_descs.empty()) - continue; - for (auto & desc : att_descs) - { - if (!attribute_descriptions.contains(desc.first)) - attribute_descriptions.emplace(desc.first, desc.second); - } + if (!attribute_descriptions.contains(desc.first)) + attribute_descriptions.emplace(desc.first, desc.second); } } @@ -1645,7 +1646,7 @@ void TableScanStep::initializePipeline(QueryPipeline & pipeline, const BuildQuer setStepDescription(step_desc.str()); RuntimeAttributeDescription tablescan_desc; tablescan_desc.description = 
step_desc.str(); - attribute_descriptions.emplace("TableScanDescription", tablescan_desc); + attribute_descriptions.emplace(RuntimeAttributeKeys::TableScanDescription, tablescan_desc); LOG_DEBUG(log, "init pipeline total run time: {} ms, table scan descriptiion: {}", total_watch.elapsedMillisecondsAsDouble(), step_desc.str()); } @@ -2086,4 +2087,18 @@ void TableScanStep::fillQueryInfoV2(ContextPtr context) query_info.index_context = std::make_shared(); } +void TableScanStep::collectPostExecutionAttributes() +{ + auto * rmt = dynamic_cast(read_step.get()); + if (!rmt) + return; + rmt->collectCacheStats(); + auto & rmt_descs = rmt->getAttributeDescriptions(); + LOG_DEBUG(log, "collectPostExecutionAttributes: collected {} attribute(s) from ReadFromMergeTree, has_cache_stats={}", + rmt_descs.size(), rmt_descs.contains(RuntimeAttributeKeys::CacheStats)); + for (auto & [k, v] : rmt_descs) + attribute_descriptions.insert_or_assign(k, v); + read_step.reset(); +} + } diff --git a/src/QueryPlan/TableScanStep.h b/src/QueryPlan/TableScanStep.h index ce71efae9ae..f571bc4bba0 100644 --- a/src/QueryPlan/TableScanStep.h +++ b/src/QueryPlan/TableScanStep.h @@ -87,6 +87,7 @@ class TableScanStep : public ISourceStep Type getType() const override { return Type::TableScan; } void initializePipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings &) override; + void collectPostExecutionAttributes(); void toProto(Protos::TableScanStep & proto, bool for_hash_equals = false) const; static std::shared_ptr fromProto(const Protos::TableScanStep & proto, ContextPtr context); @@ -219,6 +220,9 @@ class TableScanStep : public ISourceStep // Only for worker. bool is_null_source{false}; + // Kept alive after initializePipeline to allow collectPostExecutionAttributes + // to harvest CacheStats after pipeline execution. 
+ std::shared_ptr read_step; // Optimises the where clauses for a bucket table by rewriting the IN clause and hence reducing the IN set size void rewriteInForBucketTable(ContextPtr context) const; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 6c88fa3ce46..e113f6f4390 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -534,6 +534,16 @@ void HTTPHandler::processQuery( /// see also https://github.com/ClickHouse/ClickHouse/pull/26864 context = Context::createCopy(session->context); context->setSessionContext(session->context); + + // Re-apply per-request database/tenant_id: they were set on the old context + // before the session copy replaced it, so they must be restored explicitly. + if (!database.empty()) + context->setCurrentDatabase(database); + if (!tenant_id.empty()) + { + context->setSetting("tenant_id", tenant_id); + context->setTenantId(tenant_id); + } } SCOPE_EXIT({ diff --git a/src/Server/ServerPrometheusMetricsWriter.cpp b/src/Server/ServerPrometheusMetricsWriter.cpp index 16503b45882..c8827f5b062 100644 --- a/src/Server/ServerPrometheusMetricsWriter.cpp +++ b/src/Server/ServerPrometheusMetricsWriter.cpp @@ -7,7 +7,10 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -16,6 +19,7 @@ #include #include +#include namespace DB { @@ -570,6 +574,79 @@ void ServerPrometheusMetricsWriter::writePartMetrics(WriteBuffer & wb) } +void ServerPrometheusMetricsWriter::writeTTLCacheMetrics(WriteBuffer & wb) +{ + auto caches = DiskCacheFactory::instance().getAllTableTTLCaches(); + if (caches.empty()) + return; + + const String worker_id = getWorkerID(context); + + static constexpr auto PREFIX = "byconity_ttl_cache_"; + + // Emit one gauge or counter line. TYPE/HELP are written once per metric name across all + // tables, so we track which names we've already emitted the header for. 
+ std::unordered_set headers_written; + + auto emit = [&](const char * name, const char * type, const char * help, + const MetricLabels & labels, size_t value) + { + String key{PREFIX}; + key += name; + if (headers_written.insert(key).second) + { + writeOutLine(wb, "# HELP", key, help); + writeOutLine(wb, "# TYPE", key, type); + } + writeOutLine(wb, key + getLabel(labels), value); + }; + + for (auto & [uuid, cache_ptr] : caches) + { + auto * ttl = dynamic_cast(cache_ptr.get()); + if (!ttl) + continue; + + auto s = ttl->getStats(); + const String table_name = cache_ptr->getName(); + MetricLabels base{{"table_name", table_name}, {"worker_id", worker_id}}; + + // gauges — current state, can go up or down + emit("entries", GAUGE_TYPE, "Segments currently cached on disk", base, s.total_entries); + emit("bytes", GAUGE_TYPE, "Bytes currently cached on disk", base, s.total_bytes); + emit("ttl_minutes", GAUGE_TYPE, "Configured TTL window in minutes", base, ttl->getTTLMinutes()); + emit("max_size_bytes", GAUGE_TYPE, "Per-table size cap in bytes (0 = unlimited)", base, ttl->getMaxSizeBytes()); + + // counters — monotonically increasing, use rate() in Prometheus + auto base_q = base; base_q.insert({"write_type", "query"}); + auto base_p = base; base_p.insert({"write_type", "preload"}); + emit("segments_written_total", COUNTER_TYPE, "Segments written to TTL cache", base_q, s.cached_from_query); + emit("segments_written_total", COUNTER_TYPE, "", base_p, s.cached_from_preload); + emit("bytes_written_total", COUNTER_TYPE, "Bytes written to TTL cache", base_q, s.cached_bytes_query); + emit("bytes_written_total", COUNTER_TYPE, "", base_p, s.cached_bytes_preload); + + auto base_exp = base; base_exp.insert({"eviction_type", "expired"}); + auto base_size = base; base_size.insert({"eviction_type", "size_limit"}); + emit("evictions_total", COUNTER_TYPE, "Segments evicted from TTL cache", base_exp, s.evicted_expired); + emit("evictions_total", COUNTER_TYPE, "", base_size, 
s.evicted_size_limit); + + emit("async_evictions_triggered_total", COUNTER_TYPE, "Async eviction trigger count", base, s.async_eviction_triggered); + + auto base_old = base; base_old.insert({"reason", "too_old"}); + auto base_ntime = base; base_ntime.insert({"reason", "non_time_partition"}); + emit("rejections_total", COUNTER_TYPE, "Segments rejected from TTL cache", base_old, s.rejected_too_old); + emit("rejections_total", COUNTER_TYPE, "", base_ntime, s.rejected_non_time_partition); + + emit("hits_total", COUNTER_TYPE, "Cache segment read hits", base, s.total_hits); + emit("misses_total", COUNTER_TYPE, "Cache segment read misses", base, s.total_misses); + } + + // global gauges — no table label + MetricLabels wlabel{{"worker_id", worker_id}}; + emit("global_bytes", GAUGE_TYPE, "Total bytes across all TTL caches on this worker", wlabel, DiskCacheFactory::instance().getGlobalTTLUsage()); + emit("global_limit_bytes", GAUGE_TYPE, "Global TTL cache limit on this worker", wlabel, DiskCacheFactory::instance().getGlobalTTLLimit()); +} + void ServerPrometheusMetricsWriter::write(WriteBuffer & wb) { writeConfigMetrics(wb); @@ -590,5 +667,7 @@ void ServerPrometheusMetricsWriter::write(WriteBuffer & wb) /// Export the parts related metrics, the values are consistent with the system.cnch_parts writePartMetrics(wb); + + writeTTLCacheMetrics(wb); } } diff --git a/src/Server/ServerPrometheusMetricsWriter.h b/src/Server/ServerPrometheusMetricsWriter.h index ead674ee323..3cfe2c4dbfe 100644 --- a/src/Server/ServerPrometheusMetricsWriter.h +++ b/src/Server/ServerPrometheusMetricsWriter.h @@ -507,50 +507,40 @@ namespace ProfileEvents // extern const Event DropAccessPolicyFailed; extern const Event IsHostServerSuccess; extern const Event IsHostServerFailed; - // extern const Event S3GETMicroseconds; - // extern const Event S3GETBytes; - // extern const Event S3GETRequestsCount; - // extern const Event S3GETRequestsErrors; - // extern const Event S3GETRequestsThrottling; - // extern 
const Event S3GETRequestsRedirects; - // extern const Event S3HEADMicroseconds; - // extern const Event S3HEADBytes; - // extern const Event S3HEADRequestsCount; - // extern const Event S3HEADRequestsErrors; - // extern const Event S3HEADRequestsThrottling; - // extern const Event S3HEADRequestsRedirects; - // extern const Event S3POSTMicroseconds; - // extern const Event S3POSTBytes; - // extern const Event S3POSTRequestsCount; - // extern const Event S3POSTRequestsErrors; - // extern const Event S3POSTRequestsThrottling; - // extern const Event S3POSTRequestsRedirects; - // extern const Event S3DELETEMicroseconds; - // extern const Event S3DELETEBytes; - // extern const Event S3DELETERequestsCount; - // extern const Event S3DELETERequestsErrors; - // extern const Event S3DELETERequestsThrottling; - // extern const Event S3DELETERequestsRedirects; - // extern const Event S3PATCHMicroseconds; - // extern const Event S3PATCHBytes; - // extern const Event S3PATCHRequestsCount; - // extern const Event S3PATCHRequestsErrors; - // extern const Event S3PATCHRequestsThrottling; - // extern const Event S3PATCHRequestsRedirects; - // extern const Event S3PUTMicroseconds; - // extern const Event S3PUTBytes; - // extern const Event S3PUTRequestsCount; - // extern const Event S3PUTRequestsErrors; - // extern const Event S3PUTRequestsThrottling; - // extern const Event S3PUTRequestsRedirects; - // extern const Event WriteBufferFromS3WriteMicroseconds; - // extern const Event WriteBufferFromS3WriteBytes; - // extern const Event WriteBufferFromS3WriteErrors; - // extern const Event ReadFromS3BufferCount; - // extern const Event ReadBufferFromS3ReadFailed; - // extern const Event ReadBufferFromS3ReadBytes; - // extern const Event ReadBufferFromS3ReadMicroseconds; - // extern const Event S3ReadAheadReaderRead; + extern const Event UncompressedCacheHits; + extern const Event UncompressedCacheMisses; + extern const Event MarkCacheHits; + extern const Event MarkCacheMisses; + extern 
const Event DiskCacheDataHits; + extern const Event DiskCacheDataMisses; + extern const Event DiskCacheIdxHits; + extern const Event DiskCacheIdxMisses; + extern const Event DiskCacheUncompressedHit; + extern const Event DiskCacheUncompressedMiss; + extern const Event DiskCacheDecompressMicroseconds; + extern const Event DiskCacheDiskReadMicroseconds; + extern const Event IndexGranuleSeekTime; + extern const Event IndexGranuleReadTime; + extern const Event IndexGranuleCalcTime; + extern const Event NetworkReceiveElapsedMicroseconds; + extern const Event NetworkReceiveBytes; + extern const Event NetworkSendBytes; + extern const Event S3ReadMicroseconds; + extern const Event S3ReadRequestsThrottling; + extern const Event ReadBufferFromS3ReadCount; + extern const Event PrewhereSelectedMarks; + extern const Event PocoHTTPS3GetCount; + extern const Event S3ReadRequestsCount; + extern const Event S3ReadRequestsErrors; + extern const Event ReadBufferFromS3ReadBytes; + extern const Event ReadBufferFromS3ReadMicroseconds; + extern const Event PFRAWSReadBufferReadCount; + extern const Event PFRAWSReadBufferPrefetchCount; + extern const Event PFRAWSReadBufferPrefetchUtilCount; + extern const Event PFRAWSReadBufferPrefetchWaitMicro; + extern const Event PFRAWSReadBufferRemoteReadCount; + extern const Event PFRAWSReadBufferRemoteReadBytes; + extern const Event PFRAWSReadBufferReadMicro; extern const Event QueryMemoryLimitExceeded; extern const Event InsertQuery; extern const Event Merge; @@ -716,6 +706,7 @@ class ServerPrometheusMetricsWriter : public IPrometheusMetricsWriter void writeHistogramMetrics(WriteBuffer & wb); void writeInternalMetrics(WriteBuffer & wb); void writePartMetrics(WriteBuffer & wb); + void writeTTLCacheMetrics(WriteBuffer & wb); static constexpr auto MAX_CONCURRENT_DEFAULT_QUERIES_KEY = "max_concurrent_default_queries"; static constexpr auto MAX_CONCURRENT_INSERT_QUERIES_KEY = "max_concurrent_insert_queries"; @@ -1209,51 +1200,46 @@ class 
ServerPrometheusMetricsWriter : public IPrometheusMetricsWriter ProfileEvents::UniqueKeyIndexMetaCacheMiss, ProfileEvents::UniqueKeyIndexBlockCacheHit, ProfileEvents::UniqueKeyIndexBlockCacheMiss, + /// About uncompressed/mark cache + ProfileEvents::UncompressedCacheHits, + ProfileEvents::UncompressedCacheMisses, + ProfileEvents::MarkCacheHits, + ProfileEvents::MarkCacheMisses, + /// About TTL disk cache hit/miss + ProfileEvents::DiskCacheDataHits, + ProfileEvents::DiskCacheDataMisses, + ProfileEvents::DiskCacheIdxHits, + ProfileEvents::DiskCacheIdxMisses, + ProfileEvents::DiskCacheUncompressedHit, + ProfileEvents::DiskCacheUncompressedMiss, + ProfileEvents::DiskCacheDecompressMicroseconds, + ProfileEvents::DiskCacheDiskReadMicroseconds, + /// About index granule + ProfileEvents::IndexGranuleSeekTime, + ProfileEvents::IndexGranuleReadTime, + ProfileEvents::IndexGranuleCalcTime, + /// About network + ProfileEvents::NetworkReceiveElapsedMicroseconds, + ProfileEvents::NetworkReceiveBytes, + ProfileEvents::NetworkSendBytes, /// About s3 - // ProfileEvents::S3GETMicroseconds, - // ProfileEvents::S3GETBytes, - // ProfileEvents::S3GETRequestsCount, - // ProfileEvents::S3GETRequestsErrors, - // ProfileEvents::S3GETRequestsThrottling, - // ProfileEvents::S3GETRequestsRedirects, - // ProfileEvents::S3HEADMicroseconds, - // ProfileEvents::S3HEADBytes, - // ProfileEvents::S3HEADRequestsCount, - // ProfileEvents::S3HEADRequestsErrors, - // ProfileEvents::S3HEADRequestsThrottling, - // ProfileEvents::S3HEADRequestsRedirects, - // ProfileEvents::S3POSTMicroseconds, - // ProfileEvents::S3POSTBytes, - // ProfileEvents::S3POSTRequestsCount, - // ProfileEvents::S3POSTRequestsErrors, - // ProfileEvents::S3POSTRequestsThrottling, - // ProfileEvents::S3POSTRequestsRedirects, - // ProfileEvents::S3DELETEMicroseconds, - // ProfileEvents::S3DELETEBytes, - // ProfileEvents::S3DELETERequestsCount, - // ProfileEvents::S3DELETERequestsErrors, - // ProfileEvents::S3DELETERequestsThrottling, - 
// ProfileEvents::S3DELETERequestsRedirects, - // ProfileEvents::S3PATCHMicroseconds, - // ProfileEvents::S3PATCHBytes, - // ProfileEvents::S3PATCHRequestsCount, - // ProfileEvents::S3PATCHRequestsErrors, - // ProfileEvents::S3PATCHRequestsThrottling, - // ProfileEvents::S3PATCHRequestsRedirects, - // ProfileEvents::S3PUTMicroseconds, - // ProfileEvents::S3PUTBytes, - // ProfileEvents::S3PUTRequestsCount, - // ProfileEvents::S3PUTRequestsErrors, - // ProfileEvents::S3PUTRequestsThrottling, - // ProfileEvents::S3PUTRequestsRedirects, - // ProfileEvents::WriteBufferFromS3WriteMicroseconds, - // ProfileEvents::WriteBufferFromS3WriteBytes, - // ProfileEvents::WriteBufferFromS3WriteErrors, - // ProfileEvents::ReadBufferFromS3Read, - // ProfileEvents::ReadBufferFromS3ReadFailed, - // ProfileEvents::ReadBufferFromS3ReadBytes, - // ProfileEvents::ReadBufferFromS3ReadMicroseconds, - // ProfileEvents::S3ReadAheadReaderRead, + ProfileEvents::S3ReadMicroseconds, + ProfileEvents::S3ReadRequestsCount, + ProfileEvents::S3ReadRequestsErrors, + ProfileEvents::S3ReadRequestsThrottling, + ProfileEvents::ReadBufferFromS3ReadBytes, + ProfileEvents::ReadBufferFromS3ReadMicroseconds, + ProfileEvents::ReadBufferFromS3ReadCount, + ProfileEvents::PrewhereSelectedMarks, + ProfileEvents::PocoHTTPS3GetCount, + /// About PFRA (active path when enable_io_pfra=true) + ProfileEvents::PFRAWSReadBufferReadCount, + ProfileEvents::PFRAWSReadBufferPrefetchCount, + ProfileEvents::PFRAWSReadBufferPrefetchUtilCount, + ProfileEvents::PFRAWSReadBufferPrefetchWaitMicro, + ProfileEvents::PFRAWSReadBufferRemoteReadCount, + ProfileEvents::PFRAWSReadBufferRemoteReadBytes, + ProfileEvents::PFRAWSReadBufferReadMicro, ProfileEvents::QueryMemoryLimitExceeded, ProfileEvents::InsertQuery, ProfileEvents::Merge, diff --git a/src/Storages/DiskCache/DiskCacheFactory.cpp b/src/Storages/DiskCache/DiskCacheFactory.cpp index 071e2f55ca8..25ad6dd8ea8 100644 --- a/src/Storages/DiskCache/DiskCacheFactory.cpp +++ 
b/src/Storages/DiskCache/DiskCacheFactory.cpp @@ -17,14 +17,21 @@ #include #include +#include +#include +#include #include #include #include +#include #include #include +#include #include #include #include +#include +#include namespace DB { @@ -108,6 +115,127 @@ void DiskCacheFactory::shutdown() IDiskCache::close(); } +size_t DiskCacheFactory::getGlobalTTLLimit() const +{ + auto it = caches.find(DiskCacheType::MergeTree); + if (it != caches.end() && it->second) + return it->second->getSettings().ttl_cache_max_size; + return 0; +} + +IDiskCachePtr DiskCacheFactory::createDiskCacheFromTableSettings( + const String & table_name, + const UUID & table_uuid, + Context & context, + const ThrottlerPtr & throttler, + UInt64 ttl_minutes, + size_t max_size_bytes) +{ + Poco::Logger * log = &Poco::Logger::get("DiskCacheFactory"); + DiskCacheSettings cache_settings; + { + auto it = caches.find(DiskCacheType::MergeTree); + if (it != caches.end() && it->second) + cache_settings = it->second->getSettings(); + } + + // Resolve effective limit before any comparison: 0 means "use global limit". + // Multiple tables should each have an explicit per-table limit; the global limit + // is the single-table default. + size_t effective_max_size = max_size_bytes > 0 ? max_size_bytes : cache_settings.ttl_cache_max_size; + + // Check registry first (for worker reuse). + // Compare against effective_max_size so callers passing 0 (no per-table override) + // don't spuriously trigger recreation of a cache that was already created with the global limit. 
+ { + std::lock_guard lock(ttl_cache_registry_mutex); + auto reg_it = per_table_ttl_caches.find(table_uuid); + if (reg_it != per_table_ttl_caches.end()) + { + auto existing = static_pointer_cast(reg_it->second); + if (existing->getTTLMinutes() == ttl_minutes && existing->getMaxSizeBytes() == effective_max_size) + { + LOG_TRACE(log, "Reusing existing TTL cache for {} (UUID: {})", table_name, UUIDHelpers::UUIDToString(table_uuid)); + return reg_it->second; + } + LOG_INFO(log, "TTL cache settings changed for {} (UUID: {}), updating in place (ttl: {}->{}min, max_size: {}->{}bytes)", + table_name, UUIDHelpers::UUIDToString(table_uuid), + existing->getTTLMinutes(), ttl_minutes, + existing->getMaxSizeBytes(), effective_max_size); + existing->updateSettings(ttl_minutes, effective_max_size); + return reg_it->second; + } + } + + // Get volume from ttl_disk_policy + // defaults to disk_policy if not set + VolumePtr volume = context.getStoragePolicy(cache_settings.ttl_disk_policy)->getVolumeByName("local", true); + + // Per-table cache is always TTL-based + auto strategy = std::make_shared(cache_settings); + auto cache = std::make_shared( + table_name, UUIDHelpers::UUIDToString(table_uuid), volume, throttler, cache_settings, strategy, ttl_minutes, effective_max_size); + + LOG_INFO(log, "Created per-table TTL cache for {} (UUID: {}, TTL: {} minutes, max_size: {}GB, policy: {})", + table_name, UUIDHelpers::UUIDToString(table_uuid), ttl_minutes, effective_max_size / (1024*1024*1024), cache_settings.ttl_disk_policy); + + if (auto catalog = context.getCnchCatalog()) + { + try + { + auto metastore = catalog->getMetastore(); + String ns = context.getCnchConfigRef().getString("catalog.name_space", "default"); + String worker_id = getWorkerID(context.shared_from_this()); + String uuid_str = UUIDHelpers::UUIDToString(table_uuid); + // Pass worker_id (not IP) as own identity — stable across pod restarts. 
+ // The DCIREV_ reverse index stores worker_id values; findPeerOwner resolves + // them to host:port at runtime via DiskCacheFactory::resolveWorkerEndpoint. + auto fdb_idx = std::make_shared(metastore, ns, worker_id, uuid_str, worker_id); + static_pointer_cast(cache)->setFDBIndex(std::move(fdb_idx)); + + // Set up worker endpoint resolver on first use (captures rm_client shared_ptr). + if (!worker_endpoint_resolver) + { + auto rm = context.getResourceManagerClient(); + worker_endpoint_resolver = [rm]() -> std::unordered_map { + std::unordered_map result; + if (!rm) + return result; + std::vector workers; + try { rm->getAllWorkers(workers); } + catch (...) { return result; } + for (const auto & w : workers) + if (!w.id.empty()) + result[w.id] = w.host_ports.getRPCAddress(); + return result; + }; + } + } + catch (...) + { + tryLogCurrentException(log, "Failed to create TTLCacheFDBIndex, cache will use disk scan on restart"); + } + } + + // Insert into registry with re-check: if another thread won the race, discard ours. + // load() is called only on the winner so only one disk scan runs per table UUID. + { + std::lock_guard lock(ttl_cache_registry_mutex); + auto [it, inserted] = per_table_ttl_caches.emplace(table_uuid, cache); + if (!inserted) + { + LOG_TRACE(log, "Reusing TTL cache created concurrently for {} (UUID: {})", table_name, UUIDHelpers::UUIDToString(table_uuid)); + return it->second; + } + } + + // Schedule disk scan only for the winning cache object. 
+ auto & thread_pool = IDiskCache::getThreadPool(); + thread_pool.scheduleOrThrowOnError([cache] { cache->load(); }); + + return cache; +} + void DiskCacheFactory::addNewCache(Context & context, const std::string & cache_name, bool create_default) { Poco::Logger * log{&Poco::Logger::get("DiskCacheFactory")}; @@ -144,6 +272,18 @@ void DiskCacheFactory::addNewCache(Context & context, const std::string & cache_ cache_settings.lru_max_nums)); } + // Resolve global TTL cache limit — use TTL disk space when a separate ttl_disk_policy is configured + auto ttl_total_space_unlimited = !cache_settings.ttl_disk_policy.empty() + ? context.getStoragePolicy(cache_settings.ttl_disk_policy)->getVolumeByName("local", true)->getTotalSpace(true) + : total_space_unlimited; + cache_settings.ttl_cache_max_size = (cache_settings.ttl_cache_max_size > 0) + ? cache_settings.ttl_cache_max_size + : static_cast(ttl_total_space_unlimited.bytes * (cache_settings.ttl_cache_max_percent / 100.0)); + + LOG_INFO(log, "{} cache: TTL global limit {}GB", + cache_name, cache_settings.ttl_cache_max_size / (1024*1024*1024)); + + // Global cache always uses LRU (TTL cache is per-table only) if (!cache_settings.meta_cache_size_ratio) { auto disk_cache = std::make_shared( @@ -167,4 +307,99 @@ void DiskCacheFactory::addNewCache(Context & context, const std::string & cache_ } } +void DiskCacheFactory::mergeQueryCacheStats(const String & query_id, const QueryCacheStatsSnapshot & local) +{ + if (local.empty()) + return; + + std::shared_ptr entry; + { + std::shared_lock rl(query_cache_stats_mutex); + auto it = query_cache_stats_map.find(query_id); + if (it != query_cache_stats_map.end()) + entry = it->second; + } + if (!entry) + { + std::unique_lock wl(query_cache_stats_mutex); + auto [it, inserted] = query_cache_stats_map.emplace(query_id, std::make_shared()); + entry = it->second; + } + // Lock-free updates after entry is visible + entry->cache_hit_segs.fetch_add(local.cache_hit_segs, std::memory_order_relaxed); 
+ entry->cache_miss_segs.fetch_add(local.cache_miss_segs, std::memory_order_relaxed); + entry->steal_segs.fetch_add(local.steal_segs, std::memory_order_relaxed); + entry->s3_fallback_segs.fetch_add(local.s3_fallback_segs, std::memory_order_relaxed); + entry->cache_bytes.fetch_add(local.cache_bytes, std::memory_order_relaxed); + entry->s3_bytes.fetch_add(local.s3_bytes, std::memory_order_relaxed); + entry->cache_read_ms.fetch_add(local.cache_read_ms, std::memory_order_relaxed); + entry->s3_read_ms.fetch_add(local.s3_read_ms, std::memory_order_relaxed); + entry->reader_count.fetch_add(1, std::memory_order_relaxed); + for (auto cur = entry->cache_read_ms_max.load(std::memory_order_relaxed); + local.cache_read_ms > cur && !entry->cache_read_ms_max.compare_exchange_weak(cur, local.cache_read_ms, std::memory_order_relaxed);) + ; + for (auto cur = entry->cache_read_ms_min.load(std::memory_order_relaxed); + local.cache_read_ms < cur && !entry->cache_read_ms_min.compare_exchange_weak(cur, local.cache_read_ms, std::memory_order_relaxed);) + ; + entry->idx_hit_segs.fetch_add(local.idx_hit_segs, std::memory_order_relaxed); + entry->idx_miss_segs.fetch_add(local.idx_miss_segs, std::memory_order_relaxed); + entry->idx_cache_bytes.fetch_add(local.idx_cache_bytes, std::memory_order_relaxed); + entry->idx_s3_bytes.fetch_add(local.idx_s3_bytes, std::memory_order_relaxed); + entry->idx_cache_read_ms.fetch_add(local.idx_cache_read_ms, std::memory_order_relaxed); + entry->idx_s3_read_ms.fetch_add(local.idx_s3_read_ms, std::memory_order_relaxed); + if (local.idx_hit_segs > 0 || local.idx_miss_segs > 0 || local.idx_cache_bytes > 0 || local.idx_s3_bytes > 0) + entry->idx_reader_count.fetch_add(1, std::memory_order_relaxed); +} + +std::optional DiskCacheFactory::consumeQueryCacheStats(const String & query_id) +{ + std::unique_lock wl(query_cache_stats_mutex); + auto it = query_cache_stats_map.find(query_id); + if (it == query_cache_stats_map.end()) + return std::nullopt; + + const auto & e 
= *it->second; + QueryCacheStatsSnapshot snap; + snap.cache_hit_segs = e.cache_hit_segs.load(std::memory_order_relaxed); + snap.cache_miss_segs = e.cache_miss_segs.load(std::memory_order_relaxed); + snap.steal_segs = e.steal_segs.load(std::memory_order_relaxed); + snap.s3_fallback_segs = e.s3_fallback_segs.load(std::memory_order_relaxed); + snap.cache_bytes = e.cache_bytes.load(std::memory_order_relaxed); + snap.s3_bytes = e.s3_bytes.load(std::memory_order_relaxed); + snap.cache_read_ms = e.cache_read_ms.load(std::memory_order_relaxed); + snap.cache_read_ms_max = e.cache_read_ms_max.load(std::memory_order_relaxed); + auto raw_min = e.cache_read_ms_min.load(std::memory_order_relaxed); + snap.cache_read_ms_min = (raw_min == UINT64_MAX) ? 0 : raw_min; + snap.s3_read_ms = e.s3_read_ms.load(std::memory_order_relaxed); + snap.reader_count = e.reader_count.load(std::memory_order_relaxed); + snap.idx_hit_segs = e.idx_hit_segs.load(std::memory_order_relaxed); + snap.idx_miss_segs = e.idx_miss_segs.load(std::memory_order_relaxed); + snap.idx_cache_bytes = e.idx_cache_bytes.load(std::memory_order_relaxed); + snap.idx_s3_bytes = e.idx_s3_bytes.load(std::memory_order_relaxed); + snap.idx_cache_read_ms = e.idx_cache_read_ms.load(std::memory_order_relaxed); + snap.idx_s3_read_ms = e.idx_s3_read_ms.load(std::memory_order_relaxed); + snap.idx_reader_count = e.idx_reader_count.load(std::memory_order_relaxed); + query_cache_stats_map.erase(it); + return snap; +} + + +std::optional DiskCacheFactory::resolveWorkerEndpoint(const String & worker_id) +{ + if (!worker_endpoint_resolver) + return std::nullopt; + + std::lock_guard lk(worker_endpoint_cache_mutex); + time_t now = time(nullptr); + if (now - worker_endpoint_cache_refresh_time >= WORKER_ENDPOINT_CACHE_TTL_SEC) + { + worker_endpoint_cache = worker_endpoint_resolver(); + worker_endpoint_cache_refresh_time = now; + } + auto it = worker_endpoint_cache.find(worker_id); + if (it != worker_endpoint_cache.end()) + return it->second; + 
return std::nullopt; +} + } diff --git a/src/Storages/DiskCache/DiskCacheFactory.h b/src/Storages/DiskCache/DiskCacheFactory.h index ecd95f8763e..331585eeb29 100644 --- a/src/Storages/DiskCache/DiskCacheFactory.h +++ b/src/Storages/DiskCache/DiskCacheFactory.h @@ -15,9 +15,16 @@ #pragma once +#include #include #include +#include +#include +#include +#include +#include #include +#include #include namespace DB::ErrorCodes @@ -29,6 +36,61 @@ extern const int LOGICAL_ERROR; namespace DB { class Context; +class IVolume; +class Throttler; +using VolumePtr = std::shared_ptr; +using ThrottlerPtr = std::shared_ptr; + +/// Per-query cache stats accumulated on workers and surfaced via segment profiles. +struct QueryCacheStats +{ + std::atomic cache_hit_segs{0}; // data segments served from local TTL cache + std::atomic cache_miss_segs{0}; // data segments not found in local cache + std::atomic steal_segs{0}; // segments fetched from peer via steal RPC + std::atomic s3_fallback_segs{0}; // data segments read directly from S3 + std::atomic cache_bytes{0}; // bytes through cache_buffer for data (local + steal) + std::atomic s3_bytes{0}; // bytes through source_buffer for data (S3) + std::atomic cache_read_ms{0}; + std::atomic cache_read_ms_max{0}; + std::atomic cache_read_ms_min{UINT64_MAX}; + std::atomic s3_read_ms{0}; + std::atomic reader_count{0}; + // Skip-index segment counters (extension .idx) + std::atomic idx_hit_segs{0}; + std::atomic idx_miss_segs{0}; + std::atomic idx_cache_bytes{0}; + std::atomic idx_s3_bytes{0}; + std::atomic idx_cache_read_ms{0}; + std::atomic idx_s3_read_ms{0}; + std::atomic idx_reader_count{0}; +}; + +/// Plain snapshot, used for local accumulation and return values. 
+struct QueryCacheStatsSnapshot +{ + size_t cache_hit_segs{0}; + size_t cache_miss_segs{0}; + size_t steal_segs{0}; + size_t s3_fallback_segs{0}; + size_t cache_bytes{0}; + size_t s3_bytes{0}; + uint64_t cache_read_ms{0}; + uint64_t cache_read_ms_max{0}; + uint64_t cache_read_ms_min{0}; + uint64_t s3_read_ms{0}; + size_t reader_count{0}; + // Skip-index segment counters (extension .idx) + size_t idx_hit_segs{0}; + size_t idx_miss_segs{0}; + size_t idx_cache_bytes{0}; + size_t idx_s3_bytes{0}; + uint64_t idx_cache_read_ms{0}; + uint64_t idx_s3_read_ms{0}; + size_t idx_reader_count{0}; + + bool empty() const { return cache_hit_segs == 0 && cache_miss_segs == 0 && steal_segs == 0 && s3_fallback_segs == 0 + && idx_hit_segs == 0 && idx_miss_segs == 0; } +}; enum class DiskCacheType { File, // for generic file disk cache @@ -66,8 +128,66 @@ class DiskCacheFactory : public ext::singleton return it->second; } + /// Create per-table TTL cache instance from table settings + IDiskCachePtr createDiskCacheFromTableSettings( + const String & table_name, + const UUID & table_uuid, + Context & context, + const ThrottlerPtr & throttler, + UInt64 ttl_minutes, + size_t max_size_bytes = 0); + + /// Return a snapshot of all registered per-table TTL caches (UUID → cache ptr). + std::unordered_map getAllTableTTLCaches() const + { + std::lock_guard lock(ttl_cache_registry_mutex); + return per_table_ttl_caches; + } + + /// Remove a per-table TTL cache entry from the registry. + /// Called when disk_cache_ttl_hours is set to 0 so re-enabling creates a fresh object. 
+ void removeTableTTLCache(const UUID & table_uuid) + { + std::lock_guard lock(ttl_cache_registry_mutex); + per_table_ttl_caches.erase(table_uuid); + } + + /// Global TTL cache usage tracking + /// shared across all per-table TTL caches + void addGlobalTTLUsage(size_t bytes) { global_ttl_cache_usage.fetch_add(bytes); } + void releaseGlobalTTL(size_t bytes) { global_ttl_cache_usage.fetch_sub(bytes); } + size_t getGlobalTTLUsage() const { return global_ttl_cache_usage.load(); } + size_t getGlobalTTLLimit() const; + + /// Per-query cache stats registry. + /// unique_lock only for first insertion, then atomic fetch_add on the fields. + void mergeQueryCacheStats(const String & query_id, const QueryCacheStatsSnapshot & local); + std::optional consumeQueryCacheStats(const String & query_id); + + /// Resolve a stable worker_id (e.g. byconity-vw-vw-default-0) to its current RPC + /// host:port by querying the Resource Manager. Result cached for 30 seconds. + std::optional resolveWorkerEndpoint(const String & worker_id); + private: void addNewCache(Context & context, const std::string & cache_name, bool create_default); std::unordered_map caches; + + /// Per-table TTL cache registry (for workers) + std::unordered_map per_table_ttl_caches; + mutable std::mutex ttl_cache_registry_mutex; + + /// Global TTL cache usage tracking + std::atomic global_ttl_cache_usage{0}; + + /// Per-query cache stats (query_id → shared stats object) + std::unordered_map> query_cache_stats_map; + mutable std::shared_mutex query_cache_stats_mutex; + + /// Worker endpoint resolution: worker_id → host:port, refreshed every 30s from RM. 
+ std::function()> worker_endpoint_resolver; + mutable std::mutex worker_endpoint_cache_mutex; + std::unordered_map worker_endpoint_cache; + time_t worker_endpoint_cache_refresh_time{0}; + static constexpr int WORKER_ENDPOINT_CACHE_TTL_SEC = 30; }; } diff --git a/src/Storages/DiskCache/DiskCacheLRU.cpp b/src/Storages/DiskCache/DiskCacheLRU.cpp index 7d4830fc744..aad87ea97f6 100644 --- a/src/Storages/DiskCache/DiskCacheLRU.cpp +++ b/src/Storages/DiskCache/DiskCacheLRU.cpp @@ -207,7 +207,7 @@ static fs::path getRelativePathForPart(const String & part_name, const String & return fs::path(prefix) / hex_key.substr(0, 3) / hex_key / ""; } -void DiskCacheLRU::set(const String& seg_name, ReadBuffer& value, size_t weight_hint, bool is_preload) +void DiskCacheLRU::set(const String& seg_name, ReadBuffer& value, size_t weight_hint, bool is_preload, time_t min_time, time_t max_time) { if (is_droping) { diff --git a/src/Storages/DiskCache/DiskCacheLRU.h b/src/Storages/DiskCache/DiskCacheLRU.h index eb617aab200..b2e00361184 100644 --- a/src/Storages/DiskCache/DiskCacheLRU.h +++ b/src/Storages/DiskCache/DiskCacheLRU.h @@ -75,7 +75,7 @@ class DiskCacheLRU: public IDiskCache const IDiskCacheStrategyPtr & strategy_, IDiskCache::DataType type_ = IDiskCache::DataType::ALL); - void set(const String& seg_name, ReadBuffer& value, size_t weight_hint, bool is_preload) override; + void set(const String& seg_name, ReadBuffer& value, size_t weight_hint, bool is_preload, time_t min_time = 0, time_t max_time = 0) override; std::pair get(const String& seg_name) override; void load() override; size_t drop(const String & part_name) override; diff --git a/src/Storages/DiskCache/DiskCacheSettings.cpp b/src/Storages/DiskCache/DiskCacheSettings.cpp index ff69f2cd6ee..585533585cf 100644 --- a/src/Storages/DiskCache/DiskCacheSettings.cpp +++ b/src/Storages/DiskCache/DiskCacheSettings.cpp @@ -24,6 +24,7 @@ void DiskCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & { std::string 
config_prefix = fmt::format("{}.{}", root, disk_cache_name); // {root}.MergeTree disk_policy = config.getString(config_prefix + ".disk_policy", "default"); + ttl_disk_policy = config.getString(config_prefix + ".ttl_disk_policy", disk_policy); // Fallback to disk_policy lru_max_nums = config.getUInt64(config_prefix + ".lru_max_object_num", std::numeric_limits::max()); // Todo: process the case which disk not have 2 TB free space lru_max_size = config.getUInt64(config_prefix + ".lru_max_size", static_cast(2) * 1024 * 1024 * 1024 * 1024); @@ -50,16 +51,23 @@ void DiskCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & stealing_max_retry_times = config.getUInt(config_prefix + ".stealing_max_retry_times", 3); stealing_retry_sleep_ms = config.getUInt(config_prefix + ".stealing_retry_sleep_ms", 100); stealing_max_queue_count = config.getUInt(config_prefix + ".stealing_max_queue_count", 10000); + + // TTL cache settings + ttl_cache_max_size = config.getUInt64(config_prefix + ".ttl_cache_max_size", 0); + ttl_cache_max_percent = config.getDouble(config_prefix + ".ttl_cache_max_percent", 80.0); } std::string DiskCacheSettings::toString() const - { +{ return fmt::format( R"({{ - "disk_policy": {}, + "disk_policy": "{}", + "ttl_disk_policy": "{}", "lru_max_percent": {}, "lru_max_size": {}, "lru_max_nums": {}, + "ttl_cache_max_size": {}, + "ttl_cache_max_percent": {}, "random_drop_threshold": {}, "mapping_bucket_size": {}, "lru_update_interval": {}, @@ -74,13 +82,22 @@ std::string DiskCacheSettings::toString() const "stats_bucket_size": {}, "previous_disk_cache_dir": "{}", "latest_disk_cache_dir": "{}", - "meta_cache_size_ratio": "{}", - "meta_cache_nums_ratio": "{}" + "meta_cache_size_ratio": {}, + "meta_cache_nums_ratio": {}, + "stealing_max_request_rate": {}, + "stealing_connection_timeout_ms": {}, + "stealing_read_timeout_ms": {}, + "stealing_max_retry_times": {}, + "stealing_retry_sleep_ms": {}, + "stealing_max_queue_count": {} }})", disk_policy, + 
ttl_disk_policy, lru_max_percent, lru_max_size, lru_max_nums, + ttl_cache_max_size, + ttl_cache_max_percent, random_drop_threshold, mapping_bucket_size, lru_update_interval, @@ -96,6 +113,12 @@ std::string DiskCacheSettings::toString() const previous_disk_cache_dir, latest_disk_cache_dir, meta_cache_size_ratio, - meta_cache_nums_ratio); - } + meta_cache_nums_ratio, + stealing_max_request_rate, + stealing_connection_timeout_ms, + stealing_read_timeout_ms, + stealing_max_retry_times, + stealing_retry_sleep_ms, + stealing_max_queue_count); +} } diff --git a/src/Storages/DiskCache/DiskCacheSettings.h b/src/Storages/DiskCache/DiskCacheSettings.h index 13eebd0406b..4b37ed8117e 100644 --- a/src/Storages/DiskCache/DiskCacheSettings.h +++ b/src/Storages/DiskCache/DiskCacheSettings.h @@ -29,10 +29,15 @@ struct DiskCacheSettings void loadFromConfig(const Poco::Util::AbstractConfiguration & conf, const std::string & disk_cache_name); String disk_policy {"default"}; + String ttl_disk_policy {""}; // Storage policy for TTL cache, empty = use disk_policy size_t lru_max_size {std::numeric_limits::max()}; size_t lru_max_nums {std::numeric_limits::max()}; // max percent of disk total capacity size_t lru_max_percent {80}; + // TTL cache max size (bytes). 0 = use ttl_cache_max_percent instead + size_t ttl_cache_max_size {0}; + // TTL cache max percent of disk capacity (used if ttl_cache_max_size == 0) + double ttl_cache_max_percent {80.0}; // When queue size exceed random drop ratio, start drop disk cache task, range from 0 - 100 size_t random_drop_threshold {50}; // Cache mapping bucket size diff --git a/src/Storages/DiskCache/DiskCacheTTL.cpp b/src/Storages/DiskCache/DiskCacheTTL.cpp new file mode 100644 index 00000000000..bb93863a067 --- /dev/null +++ b/src/Storages/DiskCache/DiskCacheTTL.cpp @@ -0,0 +1,1257 @@ +/* + * Copyright (2022) Bytedance Ltd. 
and/or its affiliates + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Common/Exception.h" +#include "Common/hex.h" +#include "common/logger_useful.h" +#include +#include +#include +#include "Interpreters/Context.h" +#include "Storages/DiskCache/DiskCache_fwd.h" +#include "Storages/DiskCache/IDiskCache.h" +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace CurrentMetrics +{ + extern const Metric DiskCacheEvictQueueLength; +} + +namespace ProfileEvents +{ + extern const Event DiskCacheGetMetaMicroSeconds; + extern const Event DiskCacheGetTotalOps; + extern const Event DiskCacheSetTotalOps; + extern const Event DiskCacheSetTotalBytes; + extern const Event DiskCacheDataHits; + extern const Event DiskCacheDataMisses; + extern const Event DiskCacheIdxHits; + extern const Event DiskCacheIdxMisses; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYSTEM_ERROR; + extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; +} + +static constexpr auto DISK_CACHE_TEMP_FILE_SUFFIX = ".temp"; +static constexpr auto META_DISK_CACHE_DIR_PREFIX = "meta"; +static constexpr auto DATA_DISK_CACHE_DIR_PREFIX = "data"; + +namespace +{ + constexpr size_t HEX_KEY_LEN = sizeof(DiskCacheTTL::KeyType) * 2; + + // Extract UUID from segment/part name (format: 
uuid/part_name/...) + String extractUUID(const String & seg_name) + { + size_t first_slash = seg_name.find('/'); + if (first_slash == std::string::npos) + return seg_name; + + return seg_name.substr(0, first_slash); + } + + // Extract part_name from segment name (format: uuid/part_name/column_segment.ext) + String extractPartName(const String & seg_name) + { + size_t first_slash = seg_name.find('/'); + if (first_slash == std::string::npos) + return seg_name; + + size_t second_slash = seg_name.find('/', first_slash + 1); + if (second_slash == std::string::npos) + return seg_name.substr(first_slash + 1); // Return everything after uuid/ + + return seg_name.substr(first_slash + 1, second_slash - first_slash - 1); + } + + // Extract partition_id from part_name (format: 20240315_1_100_2 → 20240315) + String extractPartitionId(const String & part_name) + { + size_t underscore_pos = part_name.find('_'); + if (underscore_pos == std::string::npos) + return part_name; + + return part_name.substr(0, underscore_pos); + } + + // Get relative path for part with new structure + // Structure: prefix/uuid/partition/3char/hash_part/ + // Example: data/a1b2c3.../20240315/abc/abc123def456/ + fs::path getRelativePathForPart(const String & uuid, const String & part_name, const String & prefix) + { + String partition_id = extractPartitionId(part_name); + auto hash_part = sipHash64(part_name.data(), part_name.size()); + String hex_hash(HEX_KEY_LEN / 2, '\0'); + writeHexUIntLowercase(hash_part, hex_hash.data()); + + return fs::path(prefix) / uuid / partition_id / hex_hash.substr(0, 3) / hex_hash / ""; + } + + String formatPartitionId(time_t ts) + { + struct tm t; + gmtime_r(&ts, &t); + return fmt::format("{:04d}{:02d}{:02d}", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday); + } + + bool isHexKey(const String & hex_key) + { + if (hex_key.size() != HEX_KEY_LEN) + return false; + + for (char c : hex_key) + { + if (!(isNumericASCII(c) || (c >= 'a' && c <= 'f'))) + return false; + } + + return true; 
+ } + + bool isHexHalf(const String & s) + { + if (s.size() != HEX_KEY_LEN / 2) + return false; + for (char c : s) + if (!(isNumericASCII(c) || (c >= 'a' && c <= 'f'))) + return false; + return true; + } +} + +DiskCacheTTL::DiskCacheTTL( + const String & name_, + const String & table_uuid_, + const VolumePtr & volume_, + const ThrottlerPtr & throttler_, + const DiskCacheSettings & settings_, + const IDiskCacheStrategyPtr & strategy_, + UInt64 ttl_minutes_, + size_t max_size_bytes_, + IDiskCache::DataType type_) + : IDiskCache(name_, volume_, throttler_, settings_, strategy_, false, type_) + , set_rate_throttler(settings_.cache_set_rate_limit == 0 ? nullptr : std::make_shared(settings_.cache_set_rate_limit)) + , set_throughput_throttler(settings_.cache_set_throughput_limit == 0 ? nullptr : std::make_shared(settings_.cache_set_throughput_limit)) + , table_uuid(table_uuid_) + , ttl_minutes(ttl_minutes_) + , max_size_bytes(max_size_bytes_) // Already calculated by factory +{ + cache_stats.table_uuid = table_uuid_; + LOG_INFO(log, "Initialized TTL cache for table {} with ttl_minutes={}, max_size_bytes={} ({}GB)", + table_uuid_, ttl_minutes_, max_size_bytes, max_size_bytes / (1024*1024*1024)); + if (settings.cache_load_dispatcher_drill_down_level < -1) + { + throw Exception(fmt::format("Load dispatcher's drill down level {} invalid, " + "must be positive or -1", settings.cache_load_dispatcher_drill_down_level), + ErrorCodes::BAD_ARGUMENTS); + } + // load() is called by the factory after this object wins the registry race, + // so only one disk scan runs per table UUID. 
+} + +DiskCacheTTL::KeyType DiskCacheTTL::hash(const String & seg_key) +{ + // seg_key format: "uuid/part_name/column.bin/offset_0" + // hash_high = hash(part_name only) for grouping all segments of a part + // hash_low = hash(column + segment) for unique segment identification + + size_t first_slash = seg_key.find('/'); + if (first_slash == std::string::npos) + throw Exception("Invalid seg key: " + seg_key, ErrorCodes::LOGICAL_ERROR); + + size_t second_slash = seg_key.find('/', first_slash + 1); + if (second_slash == std::string::npos) + throw Exception("Invalid seg key: " + seg_key, ErrorCodes::LOGICAL_ERROR); + + // hash_high = hash(part_name) - all segments in same part share this + auto high = sipHash64(seg_key.data() + first_slash + 1, second_slash - first_slash - 1); + + // hash_low = hash(column/segment) - unique per segment + auto low = sipHash64(seg_key.data() + second_slash + 1, seg_key.size() - second_slash - 1); + + return {high, low}; +} + +String DiskCacheTTL::hexKey(const KeyType & key) +{ + std::string res(HEX_KEY_LEN, '\0'); + writeHexUIntLowercase(key, res.data()); + return res; +} + +std::optional DiskCacheTTL::unhexKey(const String & hex_key) +{ + if (!isHexKey(hex_key)) + return {}; + + auto low = unhexUInt(hex_key.data()); + auto high = unhexUInt(hex_key.data() + HEX_KEY_LEN / 2); + + return UInt128{high, low}; +} + +fs::path DiskCacheTTL::getPath(const DiskCacheTTL::KeyType & hash_key, const String & path, const String & seg_name, const String & prefix) const +{ + // New structure: uuid/partition/3char/hash_part/hash_low + // Example: a1b2c3d4.../20240315/abc/abc123def456/567890abcd + + String hex_key = hexKey(hash_key); + std::string_view view(hex_key); + std::string_view hex_key_low = view.substr(0, HEX_KEY_LEN / 2); + std::string_view hex_key_high = view.substr(HEX_KEY_LEN / 2, HEX_KEY_LEN); + + String part_name = extractPartName(seg_name); + String partition_id = extractPartitionId(part_name); + String data_prefix = endsWith(seg_name, 
DATA_FILE_EXTENSION) ? DATA_DISK_CACHE_DIR_PREFIX : META_DISK_CACHE_DIR_PREFIX; + + // Structure: prefix/uuid/partition/3char/hash_high/hash_low + return fs::path(path) / (prefix.empty() ? data_prefix : prefix) + / table_uuid / partition_id + / hex_key_high.substr(0, 3) / hex_key_high / hex_key_low; +} + +time_t DiskCacheTTL::parsePartitionTimestamp(const String & part_name) +{ + try + { + // Extract part name from segment path (format: uuid/part_name/segment_name) + size_t first_slash = part_name.find('/'); + if (first_slash == std::string::npos) + return 0; + + size_t second_slash = part_name.find('/', first_slash + 1); + String actual_part_name; + if (second_slash != std::string::npos) + actual_part_name = part_name.substr(first_slash + 1, second_slash - first_slash - 1); + else + actual_part_name = part_name.substr(first_slash + 1); + + // Parse partition_id from part name + MergeTreePartInfo info; + if (!MergeTreePartInfo::tryParsePartName(actual_part_name, &info, MergeTreeDataFormatVersion(1))) + return 0; + + const String & partition_id = info.partition_id; + if (partition_id.empty()) + return 0; + + // Try to parse as date/datetime + // Common formats: YYYYMMDD, YYYYMMDDHH, YYYYMM + if (partition_id.size() >= 8 && std::all_of(partition_id.begin(), partition_id.end(), ::isdigit)) + { + // Parse as YYYYMMDD + int year = std::stoi(partition_id.substr(0, 4)); + int month = std::stoi(partition_id.substr(4, 2)); + int day = partition_id.size() >= 8 ? std::stoi(partition_id.substr(6, 2)) : 1; + + struct tm tm_info = {}; + tm_info.tm_year = year - 1900; + tm_info.tm_mon = month - 1; + tm_info.tm_mday = day; + tm_info.tm_hour = 0; + tm_info.tm_min = 0; + tm_info.tm_sec = 0; + tm_info.tm_isdst = -1; + + return mktime(&tm_info); + } + + return 0; + } + catch (...) 
+ { + return 0; + } +} + +bool DiskCacheTTL::shouldCache(time_t part_ts) const +{ + // TTL cache only for time-based partitions + if (part_ts == 0) + return false; // Non-time partitions are not cached + + // TTL disabled, defensively not cache + if (ttl_minutes == 0) + return false; + + time_t now = time(nullptr); + time_t age_seconds = now - part_ts; + time_t ttl_seconds = ttl_minutes * 60; + + return age_seconds <= ttl_seconds; +} + +void DiskCacheTTL::cacheInsertLocked(Shard & shard, KeyType key, std::shared_ptr meta, const String & precomputed_partition_id) +{ + shard.cache_map[key] = meta; + UInt64 hash_high = key.items[0]; + auto & entry = shard.part_index[hash_high]; + if (entry.partition_id.empty()) + { + entry.partition_id = precomputed_partition_id.empty() + ? formatPartitionId(meta->max_timestamp) + : precomputed_partition_id; + entry.partition_ts = meta->max_timestamp; + } + entry.keys.insert(key); + entry.total_bytes += meta->size; +} + +DiskCacheTTL::CacheEraseResult DiskCacheTTL::cacheEraseLocked(Shard & shard, KeyType key) +{ + CacheEraseResult result; + auto it = shard.cache_map.find(key); + if (it == shard.cache_map.end()) + return result; + + size_t bytes = it->second->size; + shard.cache_map.erase(it); + + UInt64 hash_high = key.items[0]; + auto pit = shard.part_index.find(hash_high); + if (pit != shard.part_index.end()) + { + result.partition_id = pit->second.partition_id; + result.partition_ts = pit->second.partition_ts; + result.count = 1; + result.bytes = bytes; + + pit->second.total_bytes -= bytes; + pit->second.keys.erase(key); + if (pit->second.keys.empty()) + shard.part_index.erase(pit); + } + return result; +} + +DiskCacheTTL::CacheEraseResult DiskCacheTTL::cacheErasePartLocked(Shard & shard, UInt64 hash_high) +{ + CacheEraseResult result; + auto pit = shard.part_index.find(hash_high); + if (pit == shard.part_index.end()) + return result; + + result.partition_id = pit->second.partition_id; + result.partition_ts = 
pit->second.partition_ts; + result.hash_high = hash_high; + result.count = pit->second.keys.size(); + result.bytes = pit->second.total_bytes; + + for (const auto & key : pit->second.keys) + { + auto it = shard.cache_map.find(key); + if (it != shard.cache_map.end()) + { + if (it->second->disk && !it->second->rel_path.empty()) + result.files.emplace_back(it->second->disk, it->second->rel_path); + shard.cache_map.erase(it); + } + } + + shard.part_index.erase(pit); + return result; +} + +void DiskCacheTTL::addToPartitionStats(const String & partition_id, time_t partition_ts, size_t bytes, size_t count) +{ + { + std::shared_lock lk(cache_stats.partition_stats_mutex); + auto it = cache_stats.partition_stats.find(partition_id); + if (it != cache_stats.partition_stats.end()) + { + it->second.entry_count += count; + it->second.total_bytes += bytes; + return; + } + } + std::unique_lock lk(cache_stats.partition_stats_mutex); + auto [it, inserted] = cache_stats.partition_stats.try_emplace(partition_id); + if (inserted) + { + it->second.partition_id = partition_id; + it->second.partition_timestamp = partition_ts; + } + it->second.entry_count += count; + it->second.total_bytes += bytes; +} + +void DiskCacheTTL::applyEraseResults(std::vector & results, size_t & total_evicted, const char * log_tag) +{ + for (auto & result : results) + { + for (const auto & [disk, rel_path] : result.files) + { + try { disk->removeFileIfExists(rel_path); } + catch (...) { tryLogCurrentException(log, log_tag); } + } + if (fdb_index) + fdb_index->evictPart(result.partition_id, result.hash_high); + subtractFromPartitionStats(result); + total_evicted += result.count; + } +} + +void DiskCacheTTL::subtractFromPartitionStats(const CacheEraseResult & result) +{ + if (result.count == 0) + return; + // shared_lock suffices: we only decrement existing atomics, no map insert/rehash. 
+ std::shared_lock lk(cache_stats.partition_stats_mutex); + auto it = cache_stats.partition_stats.find(result.partition_id); + if (it == cache_stats.partition_stats.end()) + return; + auto & ps = it->second; + ps.entry_count -= result.count; + ps.total_bytes -= result.bytes; +} + +void DiskCacheTTL::set(const String& seg_name, ReadBuffer& value, size_t weight_hint, bool is_preload, time_t /*min_time*/, time_t max_time) +{ + if (is_droping) + { + LOG_WARNING(log, fmt::format("skip write disk cache for droping disk cache is running")); + return; + } + + if (weight_hint == 0) + return; + + // Use provided max_time if available, else parse from partition_id + time_t part_ts = (max_time > 0) ? max_time : parsePartitionTimestamp(seg_name); + if (!shouldCache(part_ts)) + { + if (part_ts == 0) + cache_stats.rejected_non_time_partition++; + else + cache_stats.rejected_too_old++; + LOG_TRACE(log, "Skipping cache for expired partition: {}", seg_name); + return; + } + + if (set_rate_throttler) + { + set_rate_throttler->add(1); + } + + ProfileEvents::increment(ProfileEvents::DiskCacheSetTotalOps, 1, Metrics::MetricType::Rate, {{"type", (is_preload ? 
"preload": "query")}}); + + auto key = hash(seg_name); + String part_name = extractPartName(seg_name); + String partition_id = extractPartitionId(part_name); + bool is_idx_seg = endsWith(seg_name, INDEX_FILE_EXTENSION); + time_t cached_at = time(nullptr); + + auto & shard = getShard(key.items[0]); + + // First lock: check if already exists, reserve slot + { + Stopwatch wait_sw; + std::unique_lock lock(shard.mutex); + if (wait_sw.elapsedMicroseconds() > 1000) + LOG_WARNING(log, "[ttl-perf] set() first lock waited {} us", wait_sw.elapsedMicroseconds()); + + if (shard.cache_map.find(key) != shard.cache_map.end()) + return; + + shard.cache_map[key] = std::make_shared( + DiskCacheTTLMeta::State::Caching, nullptr, 0, cached_at, part_ts + ); + } + + ReservationPtr reserved_space = nullptr; + try + { + reserved_space = volume->reserve(weight_hint); + if (reserved_space == nullptr) + { + throw Exception("Failed to reserve space", ErrorCodes::BAD_ARGUMENTS); + } + + String cache_rel_path = getRelativePath(key, seg_name).string(); + size_t weight = writeSegment(value, reserved_space, cache_rel_path); + ProfileEvents::increment(ProfileEvents::DiskCacheSetTotalBytes, weight, Metrics::MetricType::Rate, {{"type", (is_preload ? 
"preload": "query")}}); + + { + Stopwatch wait_sw; + std::unique_lock lock(shard.mutex); + if (wait_sw.elapsedMicroseconds() > 1000) + LOG_WARNING(log, "[ttl-perf] set() second lock waited {} us", wait_sw.elapsedMicroseconds()); + + auto meta = std::make_shared( + DiskCacheTTLMeta::State::Cached, reserved_space->getDisk(), weight, cached_at, part_ts, cache_rel_path + ); + cacheInsertLocked(shard, key, meta, partition_id); + total_entries++; + total_size += weight; + + // Track write source (preload vs query), split by segment type + if (is_preload) + { + if (is_idx_seg) { cache_stats.cached_idx_from_preload++; cache_stats.cached_idx_bytes_preload += weight; } + else { cache_stats.cached_from_preload++; cache_stats.cached_bytes_preload += weight; } + } + else + { + if (is_idx_seg) { cache_stats.cached_idx_from_query++; cache_stats.cached_idx_bytes_query += weight; } + else { cache_stats.cached_from_query++; cache_stats.cached_bytes_query += weight; } + } + + // Update global TTL usage + DiskCacheFactory::instance().addGlobalTTLUsage(weight); + } + + // Update partition stats outside shard mutex to avoid lock ordering with partition_stats_mutex + addToPartitionStats(partition_id, part_ts, weight); + + if (fdb_index) + fdb_index->onSet(key, seg_name, weight, part_ts); + + // Async size-based eviction once the hard cap is exceeded. + // max_size_bytes is always set (factory falls back to global limit when no per-table limit + // is configured), so one check suffices. Done after addToPartitionStats so the + // just-added partition is visible to evictOldestPartitionsUntilSpace.
+ if (max_size_bytes > 0 && total_size.load() > max_size_bytes) + { + time_t now = time(nullptr); + time_t last_trigger = last_size_eviction_trigger.load(); + + // Rate limit: at most once per 10 seconds + if (now - last_trigger > 10) + { + if (last_size_eviction_trigger.compare_exchange_strong(last_trigger, now)) + { + size_t excess = total_size.load() - max_size_bytes; + size_t target_free = excess + max_size_bytes * 0.10; + cache_stats.async_eviction_triggered++; + LOG_DEBUG(log, "Table cache {}% full, scheduling async eviction to free {} bytes", + (total_size.load() * 100 / max_size_bytes), target_free); + + auto & thread_pool = IDiskCache::getEvictPool(); + thread_pool.scheduleOrThrow([this, target_free] { + Stopwatch watch; + evictOldestPartitionsUntilSpace(target_free); + LOG_INFO(log, "Async size-based eviction freed space in {} ms", + watch.elapsedMilliseconds()); + }); + } + } + else + { + cache_stats.async_eviction_skipped_rate_limit++; + } + } + } + catch(const Exception & e) + { + String local_disk_path = reserved_space == nullptr ? 
"" : reserved_space->getDisk()->getPath(); + tryLogCurrentException(log, fmt::format("Failed to write key {} " + "to local, disk path: {}, weight: {}, fail: {}", seg_name, local_disk_path, weight_hint, e.message())); + + std::unique_lock lock(shard.mutex); + cacheEraseLocked(shard, key); // also cleans up part_index reservation slot + } +} + +std::pair DiskCacheTTL::get(const String & seg_name) +{ + ProfileEvents::increment(ProfileEvents::DiskCacheGetTotalOps); + Stopwatch watch; + SCOPE_EXIT({ProfileEvents::increment(ProfileEvents::DiskCacheGetMetaMicroSeconds, + watch.elapsedMicroseconds());}); + + // Periodic eviction check (every 5 minutes) + time_t now = time(nullptr); + time_t last_check = last_eviction_check.load(); + if (now - last_check > 300) + { + if (last_eviction_check.compare_exchange_strong(last_check, now)) + { + // Trigger eviction asynchronously + auto & thread_pool = IDiskCache::getEvictPool(); + thread_pool.scheduleOrThrow([this] { evictExpired(); }); + } + } + + auto key = hash(seg_name); + bool is_idx_seg = endsWith(seg_name, INDEX_FILE_EXTENSION); + + DiskPtr disk; + String rel_path; + CacheEraseResult erase_result; + + auto & shard = getShard(key.items[0]); + { + Stopwatch wait_sw; + std::shared_lock lock(shard.mutex); + if (wait_sw.elapsedMicroseconds() > 1000) + LOG_WARNING(log, "[ttl-perf] get() lock waited {} us, shard_size={}", wait_sw.elapsedMicroseconds(), shard.cache_map.size()); + + auto it = shard.cache_map.find(key); + if (it == shard.cache_map.end() || it->second->state != DiskCacheTTLMeta::State::Cached) + { + if (is_idx_seg) { cache_stats.idx_misses++; ProfileEvents::increment(ProfileEvents::DiskCacheIdxMisses); } + else { cache_stats.data_misses++; ProfileEvents::increment(ProfileEvents::DiskCacheDataMisses); } + } + else if (unlikely(it->second->disk == nullptr)) + { + // Corrupted entry: upgrade to exclusive lock to erase + lock.unlock(); + std::unique_lock ulock(shard.mutex); + auto it2 = shard.cache_map.find(key); + if 
(it2 != shard.cache_map.end() && it2->second->disk == nullptr) + { + LOG_ERROR(log, "Cached entry {} has null disk — corrupted meta, evicting", seg_name); + erase_result = cacheEraseLocked(shard, key); + if (erase_result.count > 0) + { + total_entries--; + total_size -= erase_result.bytes; + DiskCacheFactory::instance().releaseGlobalTTL(erase_result.bytes); + } + } + if (is_idx_seg) { cache_stats.idx_misses++; ProfileEvents::increment(ProfileEvents::DiskCacheIdxMisses); } + else { cache_stats.data_misses++; ProfileEvents::increment(ProfileEvents::DiskCacheDataMisses); } + } + else + { + if (is_idx_seg) { cache_stats.idx_hits++; ProfileEvents::increment(ProfileEvents::DiskCacheIdxHits); } + else { cache_stats.data_hits++; ProfileEvents::increment(ProfileEvents::DiskCacheDataHits); } + disk = it->second->disk; + rel_path = it->second->rel_path; + } + } + + if (erase_result.count > 0) + subtractFromPartitionStats(erase_result); + + return {disk, rel_path}; +} + +size_t DiskCacheTTL::writeSegment(ReadBuffer& buffer, ReservationPtr& reservation, const String& cache_rel_path) +{ + DiskPtr disk = reservation->getDisk(); + String temp_cache_rel_path = cache_rel_path + ".temp"; + + try + { + disk->createDirectories(fs::path(cache_rel_path).parent_path()); + + size_t written_size = 0; + { + WriteBufferFromFile to( + fs::path(disk->getPath()) / temp_cache_rel_path, DBMS_DEFAULT_BUFFER_SIZE, -1, 0666, nullptr, 0, set_throughput_throttler); + copyData(buffer, to, reservation.get()); + to.finalize(); + written_size = to.count(); + } + + disk->replaceFile(temp_cache_rel_path, cache_rel_path); + + if (disk->getFileSize(cache_rel_path) != written_size) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "cached {} file size {} doesn't match written size {}", + cache_rel_path, + disk->getFileSize(cache_rel_path), + written_size); + + return written_size; + } + catch (...) 
+ { + disk->removeFileIfExists(temp_cache_rel_path); + disk->removeFileIfExists(cache_rel_path); + throw; + } +} + +void DiskCacheTTL::evictExpired() +{ + // Single lock: scan + erase in one critical section — no disk I/O happens inside. + // Collect expired hash_highs first, then erase in a second pass + // to avoid iterator invalidation from cacheErasePartLocked. + std::vector erase_results; + size_t evicted_bytes = 0; + + for (auto & shard : shards) + { + std::unique_lock lock(shard.mutex); + + std::vector expired_hash_highs; + for (const auto & [hash_high, entry] : shard.part_index) + { + if (entry.keys.empty()) continue; + auto sample = shard.cache_map.find(*entry.keys.begin()); + if (sample != shard.cache_map.end() && !shouldCache(sample->second->max_timestamp)) + expired_hash_highs.push_back(hash_high); + } + + for (UInt64 hash_high : expired_hash_highs) + { + auto result = cacheErasePartLocked(shard, hash_high); + if (result.count > 0) + { + total_entries -= result.count; + total_size -= result.bytes; + evicted_bytes += result.bytes; + erase_results.push_back(std::move(result)); + } + } + } + + if (erase_results.empty()) + { + cache_stats.last_eviction_run = time(nullptr); + return; + } + + size_t total_evicted = 0; + applyEraseResults(erase_results, total_evicted, "Failed to evict expired segment"); + cache_stats.evicted_expired += total_evicted; + DiskCacheFactory::instance().releaseGlobalTTL(evicted_bytes); + + LOG_INFO(log, "Evicted {} expired segments, freed {} bytes", total_evicted, evicted_bytes); + cache_stats.last_eviction_run = time(nullptr); +} + +void DiskCacheTTL::evictOldestPartitionsUntilSpace(size_t needed_bytes) +{ + size_t cur = total_size.load(); + size_t target_size = cur > needed_bytes ? 
cur - needed_bytes : 0; + + LOG_DEBUG(log, "Size eviction: current={}, needed={}, target={}", cur, needed_bytes, target_size); + + size_t total_evicted = 0; + size_t evicted_bytes = 0; + + for (auto & shard : shards) + { + if (total_size.load() <= target_size) + break; + + // Snapshot part timestamps under a short lock — no allocations, just push_backs. + std::vector> by_ts; // (partition_ts, hash_high) + { + std::unique_lock lock(shard.mutex); + by_ts.reserve(shard.part_index.size()); + for (const auto & [hash_high, entry] : shard.part_index) + by_ts.emplace_back(entry.partition_ts, hash_high); + } + + // Sort oldest-first outside the lock. + std::sort(by_ts.begin(), by_ts.end()); + + // Evict under a second lock. Parts may have been removed between the two locks; + // cacheErasePartLocked returns count=0 for missing entries and is skipped. + std::vector erase_results; + { + std::unique_lock lock(shard.mutex); + size_t current_size = total_size.load(); + for (auto & [ts, hash_high] : by_ts) + { + if (current_size <= target_size) + break; + + auto result = cacheErasePartLocked(shard, hash_high); + if (result.count > 0) + { + total_entries -= result.count; + total_size -= result.bytes; + current_size -= result.bytes; + evicted_bytes += result.bytes; + erase_results.push_back(std::move(result)); + } + } + } + + applyEraseResults(erase_results, total_evicted, "Failed to evict segment for size limit"); + } + + if (total_evicted > 0) + { + cache_stats.evicted_size_limit += total_evicted; + DiskCacheFactory::instance().releaseGlobalTTL(evicted_bytes); + LOG_INFO(log, "Evicted {} segments from oldest parts for size limit, freed {} bytes", + total_evicted, evicted_bytes); + } +} + +void DiskCacheTTL::load() +{ + if (fdb_index) + { + auto result = fdb_index->reconcile( + volume, + [this](UInt128 key, const String & seg_name) { return getRelativePath(key, seg_name); }, + [this](time_t ts) { return shouldCache(ts); }, + [this](TTLCacheFDBIndex::ReconcileBatch & batch) { + // 
Group by shard: one lock per shard instead of one per entry. + std::array>>, NUM_SHARDS> by_shard; + for (auto & [key, meta] : batch) + by_shard[key.items[0] & (NUM_SHARDS - 1)].emplace_back(key, meta); + for (size_t i = 0; i < NUM_SHARDS; ++i) + { + if (by_shard[i].empty()) + continue; + std::unique_lock lock(shards[i].mutex); + for (auto & [key, meta] : by_shard[i]) + cacheInsertLocked(shards[i], key, meta); + } + + // Batch stats update: one lock per unique partition instead of one per entry. + std::unordered_map> stats_acc; // pid -> (ts, bytes, count) + for (auto & [key, meta] : batch) + { + auto pid = formatPartitionId(meta->max_timestamp); + auto & [ts, bytes, count] = stats_acc[pid]; + ts = meta->max_timestamp; + bytes += meta->size; + count++; + } + for (auto & [pid, tbc] : stats_acc) + { + auto & [ts, bytes, count] = tbc; + addToPartitionStats(pid, ts, bytes, count); + cache_stats.cached_from_restored += count; + cache_stats.cached_bytes_restored += bytes; + } + }); + + if (result) + { + auto [entries, bytes] = *result; + // fetch_add: concurrent set() calls may have already bumped these counters between cache registration and now + total_entries.fetch_add(entries, std::memory_order_relaxed); + total_size.fetch_add(bytes, std::memory_order_relaxed); + + LOG_INFO(log, "TTL cache for {} recovered from FDB index: {} entries, {} bytes", + table_uuid, entries, bytes); + return; + } + // reconcile() already logged the per-entry summary (restored/stale counts) + LOG_WARNING(log, "TTL cache for {}: FDB index had no restorable entries, falling back to disk scan", table_uuid); + } + else + { + LOG_WARNING(log, "TTL cache for {}: no FDB index available, loading from disk scan", table_uuid); + } + + LOG_INFO(log, "Loading TTL disk cache from disk scan for {}...", table_uuid); + + for (const auto & disk : volume->getDisks()) + { + DiskCacheLoader loader(*this, disk, settings.cache_loader_per_disk, + settings.cache_load_dispatcher_drill_down_level, + 
settings.cache_load_dispatcher_drill_down_level); + + for (const auto & dir_path : previous_disk_cache_dirs) + { + if (disk->exists(dir_path)) + loader.exec(dir_path); + } + + if (disk->exists(latest_disk_cache_dir)) + loader.exec(latest_disk_cache_dir); + + LOG_INFO(log, "Loaded {} segments from disk {}", loader.total_loaded, disk->getName()); + } + + LOG_INFO(log, "TTL disk cache load complete. Total: {} segments, {} bytes", total_entries.load(), total_size.load()); + + // Post-scan eviction: trigger synchronously now that partition_stats are fully populated. + // This handles the deadlock where a disk that was overfull before restart has all subsequent + // set() calls fail at volume->reserve() before reaching the eviction check in the write path, + // leaving the cache stuck full with no way to self-recover via normal writes. + // max_size_bytes is always set (factory falls back to global limit), so one check suffices. + // Use hard cap (not 90%) — max_size_bytes already encodes the configured percent of disk. 
+ if (max_size_bytes > 0 && total_size.load() > max_size_bytes) + { + size_t excess = total_size.load() - max_size_bytes; + size_t target_free = excess + max_size_bytes * 0.10; + LOG_INFO(log, "Post-scan eviction triggered: total_size={}, max={}, freeing {} bytes", + total_size.load(), max_size_bytes, target_free); + evictOldestPartitionsUntilSpace(target_free); + } +} + +size_t DiskCacheTTL::drop(const String & part_base_path) +{ + // New structure: uuid/partition/3char/hash_part/ + // part_base_path format: "uuid/part_name" + fs::path meta_path, data_path; + + if (part_base_path.empty()) + { + // Drop entire cache for this table + if (type == DataType::ALL || type == DataType::META) + meta_path = fs::path(latest_disk_cache_dir) / META_DISK_CACHE_DIR_PREFIX / table_uuid; + if (type == DataType::ALL || type == DataType::DATA) + data_path = fs::path(latest_disk_cache_dir) / DATA_DISK_CACHE_DIR_PREFIX / table_uuid; + } + else + { + // Drop specific part: extract uuid and part_name from part_base_path + String uuid = extractUUID(part_base_path); + String part_name = extractPartName(part_base_path); + + if (type == DataType::ALL || type == DataType::META) + meta_path = fs::path(latest_disk_cache_dir) / getRelativePathForPart(uuid, part_name, META_DISK_CACHE_DIR_PREFIX); + if (type == DataType::ALL || type == DataType::DATA) + data_path = fs::path(latest_disk_cache_dir) / getRelativePathForPart(uuid, part_name, DATA_DISK_CACHE_DIR_PREFIX); + } + + LOG_TRACE(log, "Dropping cache for part {} (meta: {}, data: {})", part_base_path, meta_path.string(), data_path.string()); + + const Disks & disks = volume->getDisks(); + size_t delete_file_size = 0; + + for (const auto & disk : disks) + { + if (!meta_path.empty() && disk->exists(meta_path)) + { + DiskCacheDeleter deleter(*this, disk, 1, -1, -1); + deleter.exec(meta_path); + delete_file_size += deleter.delete_file_size; + } + + if (!data_path.empty() && disk->exists(data_path)) + { + DiskCacheDeleter deleter(*this, disk, 1, 
-1, -1); + deleter.exec(data_path); + delete_file_size += deleter.delete_file_size; + } + } + + if (part_base_path.empty()) + { + size_t dropped_bytes = total_size.load(); + for (auto & shard : shards) + { + std::unique_lock lock(shard.mutex); + shard.cache_map.clear(); + shard.part_index.clear(); + } + total_entries.store(0); + total_size.store(0); + DiskCacheFactory::instance().releaseGlobalTTL(dropped_bytes); + + std::unique_lock lk(cache_stats.partition_stats_mutex); + cache_stats.partition_stats.clear(); + } + else + { + String part_name_only = extractPartName(part_base_path); + UInt64 hash_high = sipHash64(part_name_only.data(), part_name_only.size()); + + auto & shard = getShard(hash_high); + CacheEraseResult result; + { + std::unique_lock lock(shard.mutex); + result = cacheErasePartLocked(shard, hash_high); + if (result.count > 0) + { + total_entries -= result.count; + total_size -= result.bytes; + DiskCacheFactory::instance().releaseGlobalTTL(result.bytes); + } + } + if (result.count > 0) + { + subtractFromPartitionStats(result); + if (fdb_index) + fdb_index->evictPart(result.partition_id, result.hash_high); + } + } + + LOG_TRACE(log, "Dropped {} bytes of cache for part {}", delete_file_size, part_base_path); + return delete_file_size; +} + +// DiskIterator implementations +DiskCacheTTL::DiskIterator::DiskIterator( + const String & name_, DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk_, int min_depth_parallel_, int max_depth_parallel_) + : name(name_), disk_cache(cache_), disk(disk_), worker_per_disk(worker_per_disk_), + min_depth_parallel(min_depth_parallel_), max_depth_parallel(max_depth_parallel_) +{ + log = &Poco::Logger::get(name); + + if (worker_per_disk > 1) + pool = std::make_unique(worker_per_disk); +} + +void DiskCacheTTL::DiskIterator::exec(std::filesystem::path entry_path) +{ + iterateDirectory(entry_path, 0); + + if (pool) + pool->wait(); +} + +void DiskCacheTTL::DiskIterator::iterateDirectory(std::filesystem::path rel_path, 
size_t depth) +{ + if (!disk->exists(rel_path)) + return; + + for (auto it = disk->iterateDirectory(rel_path); it->isValid(); it->next()) + { + auto entry_path = rel_path / it->name(); + + if (disk->isDirectory(entry_path)) + { + iterateDirectory(entry_path, depth + 1); + } + else if (disk->isFile(entry_path)) + { + iterateFile(entry_path, disk->getFileSize(entry_path)); + } + } +} + +// DiskCacheLoader +DiskCacheTTL::DiskCacheLoader::DiskCacheLoader( + DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk_, int min_depth_parallel_, int max_depth_parallel_) + : DiskIterator("DiskCacheTTLLoader", cache_, disk_, worker_per_disk_, min_depth_parallel_, max_depth_parallel_) +{ +} + +DiskCacheTTL::DiskCacheLoader::~DiskCacheLoader() +{ +} + +void DiskCacheTTL::DiskCacheLoader::iterateFile(std::filesystem::path file_path, size_t file_size) +{ + String filename = file_path.filename(); + + // Skip temp files + if (endsWith(filename, DISK_CACHE_TEMP_FILE_SUFFIX)) + { + disk->removeFileIfExists(file_path); + return; + } + + // Skip and clean up 0-byte files — they indicate an interrupted or empty write + // and would cause false cache HITs returning empty content. + if (file_size == 0) + { + disk->removeFileIfExists(file_path); + return; + } + + // Path structure: {cache_dir}/{data|meta}/{uuid}/{partition}/{3char}/{hash_high}/{hash_low} + // The filename is hash_low (low 64 bits of the key) and the parent dir is hash_high. + // Parse each half as a hex UInt64 to reconstruct the full UInt128 key. 
+ if (!isHexHalf(filename)) + { + LOG_WARNING(log, "Invalid cache file (hash_low): {}", file_path.string()); + return; + } + UInt64 low = unhexUInt(filename.data()); + + // New structure: data/uuid/partition/3char/hash_high/hash_low + // Extract partition from path hierarchy + auto hash_high_dir = file_path.parent_path().filename().string(); // hash_high + auto partition_dir = file_path.parent_path().parent_path().parent_path().filename().string(); // partition_id + + if (!isHexHalf(hash_high_dir)) + { + LOG_WARNING(log, "Invalid cache directory (hash_high): {}", file_path.string()); + return; + } + UInt64 high = unhexUInt(hash_high_dir.data()); + + // Build full key matching UInt128{high, low} as returned by unhexKey + UInt128 key = {high, low}; + + // Parse timestamp from partition_id (e.g., "20240315") + time_t part_ts = 0; + if (partition_dir.size() >= 8 && std::all_of(partition_dir.begin(), partition_dir.end(), ::isdigit)) + { + try + { + int year = std::stoi(partition_dir.substr(0, 4)); + int month = std::stoi(partition_dir.substr(4, 2)); + int day = std::stoi(partition_dir.substr(6, 2)); + + struct tm tm_info = {}; + tm_info.tm_year = year - 1900; + tm_info.tm_mon = month - 1; + tm_info.tm_mday = day; + tm_info.tm_isdst = -1; + + part_ts = mktime(&tm_info); + } + catch (...) + { + LOG_WARNING(log, "Failed to parse partition timestamp from: {}", partition_dir); + } + } + + // Skip expired or non-time-based segments; delete the stale file so it + // doesn't accumulate on disk across restarts. 
+ if (!disk_cache.shouldCache(part_ts)) + { + disk->removeFileIfExists(file_path); + return; + } + + String file_path_str = file_path.string(); + { + auto & shard = disk_cache.getShard(high); + std::unique_lock lock(shard.mutex); + auto meta = std::make_shared( + DiskCacheTTLMeta::State::Cached, disk, file_size, time(nullptr), part_ts, std::move(file_path_str) + ); + disk_cache.cacheInsertLocked(shard, key, meta, partition_dir); + disk_cache.total_entries++; + disk_cache.total_size += file_size; + DiskCacheFactory::instance().addGlobalTTLUsage(file_size); + } + + // Update partition stats outside shard mutex to avoid lock ordering with partition_stats_mutex + disk_cache.addToPartitionStats(partition_dir, part_ts, file_size); + disk_cache.cache_stats.cached_from_restored++; + disk_cache.cache_stats.cached_bytes_restored += file_size; + + total_loaded++; +} + +// DiskCacheMigrator (stub) +DiskCacheTTL::DiskCacheMigrator::DiskCacheMigrator( + DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk_, int min_depth_parallel_, int max_depth_parallel_) + : DiskIterator("DiskCacheTTLMigrator", cache_, disk_, worker_per_disk_, min_depth_parallel_, max_depth_parallel_) +{ +} + +DiskCacheTTL::DiskCacheMigrator::~DiskCacheMigrator() +{ +} + +void DiskCacheTTL::DiskCacheMigrator::iterateFile(std::filesystem::path, size_t) +{ +} + +// DiskCacheDeleter (stub) +DiskCacheTTL::DiskCacheDeleter::DiskCacheDeleter( + DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk_, int min_depth_parallel_, int max_depth_parallel_) + : DiskIterator("DiskCacheTTLDeleter", cache_, disk_, worker_per_disk_, min_depth_parallel_, max_depth_parallel_) +{ +} + +DiskCacheTTL::DiskCacheDeleter::~DiskCacheDeleter() +{ +} + +void DiskCacheTTL::DiskCacheDeleter::exec(std::filesystem::path entry_path) +{ + disk->removeRecursive(entry_path); +} + +void DiskCacheTTL::DiskCacheDeleter::iterateFile(std::filesystem::path, size_t) +{ +} + +DiskCacheTTL::TTLCacheStats DiskCacheTTL::getStats() const +{ + 
TTLCacheStats stats; + stats.table_uuid = cache_stats.table_uuid; + stats.total_entries = total_entries.load(); + stats.total_bytes = total_size.load(); + stats.evicted_expired = cache_stats.evicted_expired.load(); + stats.evicted_size_limit = cache_stats.evicted_size_limit.load(); + stats.rejected_non_time_partition = cache_stats.rejected_non_time_partition.load(); + stats.rejected_too_old = cache_stats.rejected_too_old.load(); + stats.last_eviction_run = cache_stats.last_eviction_run.load(); + stats.async_eviction_triggered = cache_stats.async_eviction_triggered.load(); + stats.async_eviction_skipped_rate_limit = cache_stats.async_eviction_skipped_rate_limit.load(); + stats.cached_from_preload = cache_stats.cached_from_preload.load(); + stats.cached_from_query = cache_stats.cached_from_query.load(); + stats.cached_bytes_preload = cache_stats.cached_bytes_preload.load(); + stats.cached_bytes_query = cache_stats.cached_bytes_query.load(); + stats.cached_from_restored = cache_stats.cached_from_restored.load(); + stats.cached_bytes_restored = cache_stats.cached_bytes_restored.load(); + stats.cached_idx_from_preload = cache_stats.cached_idx_from_preload.load(); + stats.cached_idx_bytes_preload = cache_stats.cached_idx_bytes_preload.load(); + stats.cached_idx_from_query = cache_stats.cached_idx_from_query.load(); + stats.cached_idx_bytes_query = cache_stats.cached_idx_bytes_query.load(); + stats.data_hits = cache_stats.data_hits.load(); + stats.data_misses = cache_stats.data_misses.load(); + stats.idx_hits = cache_stats.idx_hits.load(); + stats.idx_misses = cache_stats.idx_misses.load(); + stats.total_hits = stats.data_hits + stats.idx_hits; + stats.total_misses = stats.data_misses + stats.idx_misses; + return stats; +} + +std::vector DiskCacheTTL::getPartitionStats() const +{ + std::vector result; + std::shared_lock lock(cache_stats.partition_stats_mutex); + result.reserve(cache_stats.partition_stats.size()); + for (const auto & [partition_id, internal_stats] : 
cache_stats.partition_stats) + { + PartitionStats snapshot; + snapshot.partition_id = internal_stats.partition_id; + snapshot.entry_count = internal_stats.entry_count.load(); + snapshot.total_bytes = internal_stats.total_bytes.load(); + snapshot.partition_timestamp = internal_stats.partition_timestamp; + result.push_back(snapshot); + } + return result; +} + +std::optional DiskCacheTTL::findPeerOwner(const String & seg_name) +{ + if (!fdb_index) + return std::nullopt; + + auto key = hash(seg_name); + String part_name = extractPartName(seg_name); + String partition_id = extractPartitionId(part_name); + + auto maybe_worker_id = fdb_index->findPeerOwner(key, partition_id); + if (!maybe_worker_id) + return std::nullopt; + + return DiskCacheFactory::instance().resolveWorkerEndpoint(*maybe_worker_id); +} + +} + diff --git a/src/Storages/DiskCache/DiskCacheTTL.h b/src/Storages/DiskCache/DiskCacheTTL.h new file mode 100644 index 00000000000..7d35c791a27 --- /dev/null +++ b/src/Storages/DiskCache/DiskCacheTTL.h @@ -0,0 +1,349 @@ +/* + * Copyright (2022) Bytedance Ltd. and/or its affiliates + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class TTLCacheFDBIndex; + +class DiskCacheTTLMeta +{ +public: + enum class State + { + Caching, + Cached, + Deleting, + }; + + DiskCacheTTLMeta(State state_, const DiskPtr & disk_, size_t size_, time_t cached_at_, time_t max_ts_, String rel_path_ = {}) + : state(state_), disk(disk_), size(size_), cached_at(cached_at_), max_timestamp(max_ts_), rel_path(std::move(rel_path_)) + {} + + State state; + DiskPtr disk; + size_t size; + time_t cached_at; + time_t max_timestamp; + String rel_path; // exact on-disk relative path; avoids reconstructing prefix (data/ vs meta/) at eviction time +}; + +struct DiskCacheTTLWeightFunction +{ + size_t operator()(const DiskCacheTTLMeta& meta) const + { + if (meta.state == DiskCacheTTLMeta::State::Cached) + return meta.size; + return 0; + } +}; + +/// TTL-based disk cache +/// Evicts parts based on partition timestamp and retention window +/// Parallel to DiskCacheLRU +class DiskCacheTTL: public IDiskCache +{ +public: + using KeyType = UInt128; + + DiskCacheTTL( + const String & name_, + const String & table_uuid_, + const VolumePtr & volume, + const ThrottlerPtr & throttler, + const DiskCacheSettings & settings, + const IDiskCacheStrategyPtr & strategy_, + UInt64 ttl_minutes_, + size_t max_size_bytes_ = 0, // 0 = use settings.ttl_cache_max_size + IDiskCache::DataType type_ = IDiskCache::DataType::ALL); + + void set(const String& seg_name, ReadBuffer& value, size_t weight_hint, bool is_preload, time_t min_time = 0, time_t max_time = 0) override; + std::pair get(const String& seg_name) override; + void load() override; + size_t drop(const String & part_name) override; + + size_t getKeyCount() const override { return total_entries.load(); } + size_t getCachedSize() const override { return total_size.load(); } + std::filesystem::path getRelativePath(const KeyType & key, const 
String & seg_name, const String & prefix = {}) { return getPath(key, latest_disk_cache_dir, seg_name, prefix);} + + std::filesystem::path getPath(const KeyType & key, const String & path, const String & seg_name, const String & prefix) const; + + static KeyType hash(const String & seg_name); + static String hexKey(const KeyType & key); + + void evictExpired(); + void evictOldestPartitionsUntilSpace(size_t needed_bytes); + static std::optional unhexKey(const String & hex); + + /// Parse partition timestamp from part name + /// Returns 0 if partition is not time-based + static time_t parsePartitionTimestamp(const String & part_name); + + // Stats structures for observability + + // Internal stats with atomics (not copyable) + struct PartitionStatsInternal + { + String partition_id; + std::atomic entry_count{0}; + std::atomic total_bytes{0}; + time_t partition_timestamp{0}; + }; + + // Snapshot for return (plain types, copyable) + struct PartitionStats + { + String partition_id; + size_t entry_count{0}; + size_t total_bytes{0}; + time_t partition_timestamp{0}; + }; + + // Snapshot for return (plain types, copyable) + struct TTLCacheStats + { + String table_uuid; + size_t total_entries{0}; + size_t total_bytes{0}; + + // TTL-specific counters + size_t evicted_expired{0}; + size_t evicted_size_limit{0}; + size_t rejected_non_time_partition{0}; + size_t rejected_too_old{0}; + time_t last_eviction_run{0}; + + // Async size-based eviction stats + size_t async_eviction_triggered{0}; + size_t async_eviction_skipped_rate_limit{0}; + + // Write source breakdown (preload vs query-triggered vs restored from FDB on startup) + size_t cached_from_preload{0}; + size_t cached_from_query{0}; + size_t cached_bytes_preload{0}; + size_t cached_bytes_query{0}; + size_t cached_from_restored{0}; + size_t cached_bytes_restored{0}; + // Skip-index write breakdown (same events, idx extension only) + size_t cached_idx_from_preload{0}; + size_t cached_idx_bytes_preload{0}; + size_t 
cached_idx_from_query{0}; + size_t cached_idx_bytes_query{0}; + + // Aggregated hit/miss counts across all partitions, by segment type + size_t total_hits{0}; + size_t total_misses{0}; + size_t data_hits{0}; + size_t data_misses{0}; + size_t idx_hits{0}; + size_t idx_misses{0}; + }; + + // Internal stats with atomics + struct TTLCacheStatsInternal + { + String table_uuid; + + // TTL-specific counters + std::atomic evicted_expired{0}; + std::atomic evicted_size_limit{0}; + std::atomic rejected_non_time_partition{0}; + std::atomic rejected_too_old{0}; + std::atomic last_eviction_run{0}; + + // Async size-based eviction stats + std::atomic async_eviction_triggered{0}; + std::atomic async_eviction_skipped_rate_limit{0}; + + // Write source breakdown (preload vs query-triggered vs restored from FDB on startup) + std::atomic cached_from_preload{0}; + std::atomic cached_from_query{0}; + std::atomic cached_bytes_preload{0}; + std::atomic cached_bytes_query{0}; + std::atomic cached_from_restored{0}; + std::atomic cached_bytes_restored{0}; + // Skip-index write breakdown + std::atomic cached_idx_from_preload{0}; + std::atomic cached_idx_bytes_preload{0}; + std::atomic cached_idx_from_query{0}; + std::atomic cached_idx_bytes_query{0}; + + // Aggregated hit/miss by segment type + std::atomic data_hits{0}; + std::atomic data_misses{0}; + std::atomic idx_hits{0}; + std::atomic idx_misses{0}; + + // Per-partition breakdown + mutable std::shared_mutex partition_stats_mutex; + std::unordered_map partition_stats; + }; + + TTLCacheStats getStats() const; + std::vector getPartitionStats() const; + + UInt64 getTTLMinutes() const { return ttl_minutes.load(std::memory_order_relaxed); } + size_t getMaxSizeBytes() const { return max_size_bytes.load(std::memory_order_relaxed); } + void setFDBIndex(std::shared_ptr idx) { fdb_index = std::move(idx); } + + void updateSettings(UInt64 new_ttl_minutes, size_t new_max_size_bytes) + { + ttl_minutes.store(new_ttl_minutes, std::memory_order_relaxed); 
+ max_size_bytes.store(new_max_size_bytes, std::memory_order_relaxed); + } + + /// Look up whether a peer worker has this segment cached via the FDB reverse index. + /// Returns peer RPC endpoint if found, nullopt if not found or FDB unavailable. + /// Gated on fdb_index being set; caller is responsible for checking stealing mode. + std::optional findPeerOwner(const String & seg_name); + +private: + struct CacheEraseResult { + String partition_id; + time_t partition_ts{0}; + UInt64 hash_high{0}; + size_t count{0}; + size_t bytes{0}; + std::vector> files; + }; + + struct PartIndexEntry { + String partition_id; + time_t partition_ts{0}; + std::unordered_set keys; + size_t total_bytes{0}; + }; + + size_t writeSegment(ReadBuffer& buffer, ReservationPtr& reservation, const String& cache_rel_path); + bool shouldCache(time_t part_ts) const; + + static constexpr size_t NUM_SHARDS = 64; + + struct Shard { + mutable std::shared_mutex mutex; + std::unordered_map, UInt128Hash> cache_map; + std::unordered_map part_index; + }; + + Shard & getShard(UInt64 hash_high) { return shards[hash_high & (NUM_SHARDS - 1)]; } + + /// Structural helpers — caller must hold shard.mutex + void cacheInsertLocked(Shard & shard, KeyType key, std::shared_ptr meta, const String & precomputed_partition_id = {}); + CacheEraseResult cacheEraseLocked(Shard & shard, KeyType key); + CacheEraseResult cacheErasePartLocked(Shard & shard, UInt64 hash_high); + + /// Stats helpers — caller must NOT hold any shard mutex + void addToPartitionStats(const String & partition_id, time_t partition_ts, size_t bytes, size_t count = 1); + void subtractFromPartitionStats(const CacheEraseResult & result); + + /// Apply a batch of erase results: delete files, notify FDB, update partition stats. + /// Caller must NOT hold any shard mutex. Increments total_evicted by result.count for each entry. 
+ void applyEraseResults(std::vector & results, size_t & total_evicted, const char * log_tag); + + + struct DiskIterator : private boost::noncopyable + { + explicit DiskIterator( + const String & name_, DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk_, int min_depth_parallel_, int max_depth_parallel_); + virtual ~DiskIterator() = default; + + virtual void exec(std::filesystem::path entry_path); + virtual void iterateDirectory(std::filesystem::path rel_path, size_t depth); + virtual void iterateFile(std::filesystem::path file_path, size_t file_size) = 0; + + String name; + DiskCacheTTL & disk_cache; + DiskPtr disk; + size_t worker_per_disk{1}; + int min_depth_parallel{-1}; + int max_depth_parallel{-1}; + std::unique_ptr pool; + ExceptionHandler handler; + Poco::Logger * log; + }; + + struct DiskCacheLoader : DiskIterator + { + explicit DiskCacheLoader( + DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk, int min_depth_parallel, int max_depth_parallel); + ~DiskCacheLoader() override; + void iterateFile(std::filesystem::path file_path, size_t file_size) override; + + std::atomic_size_t total_loaded = 0; + }; + + struct DiskCacheMigrator : DiskIterator + { + explicit DiskCacheMigrator( + DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk, int min_depth_parallel, int max_depth_parallel); + ~DiskCacheMigrator() override; + void iterateFile(std::filesystem::path file_path, size_t file_size) override; + + std::atomic_size_t total_migrated = 0; + }; + + struct DiskCacheDeleter : DiskIterator + { + explicit DiskCacheDeleter( + DiskCacheTTL & cache_, DiskPtr disk_, size_t worker_per_disk, int min_depth_parallel, int max_depth_parallel); + ~DiskCacheDeleter() override; + void exec(std::filesystem::path entry_path) override; + void iterateFile(std::filesystem::path file_path, size_t file_size) override; + + size_t delete_file_size {0}; + }; + + /// FDB-backed index for fast startup recovery + /// optional — null if catalog unavailable + 
std::shared_ptr fdb_index; + + ThrottlerPtr set_rate_throttler; + ThrottlerPtr set_throughput_throttler; + std::atomic is_droping{false}; + + const String table_uuid; + std::atomic ttl_minutes; + std::atomic max_size_bytes; // 0 = unlimited + + std::array shards; + std::atomic total_entries{0}; + std::atomic total_size{0}; + + /// Last eviction check time + std::atomic last_eviction_check{0}; + + /// Last async size-based eviction trigger time + std::atomic last_size_eviction_trigger{0}; + + /// Cache statistics + TTLCacheStatsInternal cache_stats; +}; + +} diff --git a/src/Storages/DiskCache/IDiskCache.cpp b/src/Storages/DiskCache/IDiskCache.cpp index 91e20554830..1e22915ecb2 100644 --- a/src/Storages/DiskCache/IDiskCache.cpp +++ b/src/Storages/DiskCache/IDiskCache.cpp @@ -69,9 +69,9 @@ void IDiskCache::init(const Context & global_context) false /*throw_on_exception*/); local_disk_cache_preload_thread_pool = std::make_unique( - settings.cnch_parallel_preloading, - settings.cnch_parallel_preloading, - settings.cnch_parallel_preloading * 100, + settings.local_disk_cache_preload_thread_pool_size, + settings.local_disk_cache_preload_thread_pool_size, + settings.local_disk_cache_preload_thread_pool_size * 100, false /*throw_on_exception*/); } diff --git a/src/Storages/DiskCache/IDiskCache.h b/src/Storages/DiskCache/IDiskCache.h index 3756560dc84..b73a450a1f2 100644 --- a/src/Storages/DiskCache/IDiskCache.h +++ b/src/Storages/DiskCache/IDiskCache.h @@ -82,7 +82,8 @@ class IDiskCache : public std::enable_shared_from_this virtual void shutdown(); /// set segment name in cache and write value to disk cache - virtual void set(const String & key, ReadBuffer & value, size_t weight_hint, bool is_preload) = 0; + /// min_time/max_time: optional timestamps from part data (0 = not provided, will parse from partition_id for TTL cache) + virtual void set(const String & key, ReadBuffer & value, size_t weight_hint, bool is_preload, time_t min_time = 0, time_t max_time = 0) = 0; /// 
get segment from cache and return local path if exists. virtual std::pair get(const String & key) = 0; @@ -200,7 +201,7 @@ class MultiDiskCache : public IDiskCache return dropped_size; } - virtual void set(const String &, ReadBuffer &, size_t, bool ) override { throw Exception("MultiDiskCache `set` is not supported now", ErrorCodes::LOGICAL_ERROR);} + virtual void set(const String &, ReadBuffer &, size_t, bool, time_t = 0, time_t = 0) override { throw Exception("MultiDiskCache `set` is not supported now", ErrorCodes::LOGICAL_ERROR);} virtual std::pair get(const String &) override { throw Exception("MultiDiskCache `get` is not supported now", ErrorCodes::LOGICAL_ERROR);} virtual void load() override { throw Exception("MultiDiskCache `load` is not supported now", ErrorCodes::LOGICAL_ERROR);} virtual size_t getKeyCount() const override {throw Exception("MultiDiskCache `getKeyCount` is not supported now", ErrorCodes::LOGICAL_ERROR); } diff --git a/src/Storages/DiskCache/PartFileDiskCacheSegment.cpp b/src/Storages/DiskCache/PartFileDiskCacheSegment.cpp index b2c895ffecb..ccef92ab2dc 100644 --- a/src/Storages/DiskCache/PartFileDiskCacheSegment.cpp +++ b/src/Storages/DiskCache/PartFileDiskCacheSegment.cpp @@ -89,7 +89,10 @@ PartFileDiskCacheSegment::PartFileDiskCacheSegment( mrk_file_pos.file_size, merge_tree_reader_settings, 1, - mark_disk_cache_) + mark_disk_cache_, + {}, + data_part_->storage.getStorageUUID(), + data_part_->getUniquePartName()) { } @@ -180,11 +183,14 @@ void PartFileDiskCacheSegment::cacheToDisk(IDiskCache & disk_cache, bool throw_e != DiskCacheMode:: FORCE_STEAL_DISK_CACHE) // FORCE_STEAL_DISK_CACHE is used for testing, which only allow remote cache request so will skip local cache write { + // Get min/max time from part for TTL cache granularity + auto [min_time, max_time] = data_part->getMinMaxTime(); + if (!preload_level || (preload_level & PreloadLevelSettings::DataPreload) == PreloadLevelSettings::DataPreload) { 
data_file->seek(stream_file_pos.file_offset + cache_data_left_offset); LimitReadBuffer segment_value(*data_file, cache_data_bytes, false); - disk_cache.getDataCache()->set(getSegmentName(), segment_value, cache_data_bytes, preload_level > 0); + disk_cache.getDataCache()->set(getSegmentName(), segment_value, cache_data_bytes, preload_level > 0, min_time, max_time); LOG_TRACE(disk_cache.getLogger(), "Cached part{} data file: {}, preload_level: {}", extension, getSegmentName(), preload_level); } @@ -194,7 +200,7 @@ void PartFileDiskCacheSegment::cacheToDisk(IDiskCache & disk_cache, bool throw_e data_file->seek(mrk_file_pos.file_offset); LimitReadBuffer marks_value(*data_file, mrk_file_pos.file_size, false); String marks_key = getMarkName(); - disk_cache.getMetaCache()->set(marks_key, marks_value, mrk_file_pos.file_size, preload_level > 0); + disk_cache.getMetaCache()->set(marks_key, marks_value, mrk_file_pos.file_size, preload_level > 0, min_time, max_time); LOG_TRACE(disk_cache.getLogger(), "Cached part{} mark file: {}, preload_level: {}", extension, marks_key, preload_level); } diff --git a/src/Storages/DiskCache/PreloadRegistry.cpp b/src/Storages/DiskCache/PreloadRegistry.cpp new file mode 100644 index 00000000000..a7ff14a27af --- /dev/null +++ b/src/Storages/DiskCache/PreloadRegistry.cpp @@ -0,0 +1,71 @@ +#include "PreloadRegistry.h" + +namespace DB +{ + +PreloadRegistry & PreloadRegistry::instance() +{ + static PreloadRegistry inst; + return inst; +} + +void PreloadRegistry::registerParts( + const String & table_name, + const String & table_uuid, + const String & partition_id, + size_t parts_count, + UInt64 preload_level) +{ + if (parts_count == 0) + return; + + Key key{table_uuid, partition_id}; + std::lock_guard lock(mu); + auto it = entries.find(key); + if (it == entries.end()) + { + auto entry = std::make_shared(table_name, table_uuid, partition_id, parts_count, preload_level); + entry->parts_in_flight.store(parts_count, std::memory_order_relaxed); + 
entries.emplace(key, std::move(entry)); + } + else + { + it->second->parts_submitted += parts_count; + it->second->parts_in_flight.fetch_add(parts_count, std::memory_order_relaxed); + } +} + +void PreloadRegistry::partFinished(const String & table_uuid, const String & partition_id) +{ + Key key{table_uuid, partition_id}; + std::lock_guard lock(mu); + auto it = entries.find(key); + if (it == entries.end()) + return; + if (it->second->parts_in_flight.fetch_sub(1, std::memory_order_acq_rel) == 1) + entries.erase(it); +} + +std::vector PreloadRegistry::getSnapshot() const +{ + std::lock_guard lock(mu); + std::vector result; + result.reserve(entries.size()); + auto now = std::chrono::steady_clock::now(); + for (const auto & [_, e] : entries) + { + auto elapsed = std::chrono::duration_cast(now - e->start_time).count(); + result.push_back({ + e->table_name, + e->table_uuid, + e->partition_id, + e->parts_in_flight.load(std::memory_order_relaxed), + e->parts_submitted, + static_cast(elapsed), + e->preload_level, + }); + } + return result; +} + +} diff --git a/src/Storages/DiskCache/PreloadRegistry.h b/src/Storages/DiskCache/PreloadRegistry.h new file mode 100644 index 00000000000..b46616d5ae3 --- /dev/null +++ b/src/Storages/DiskCache/PreloadRegistry.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +struct PreloadEntry +{ + String table_name; + String table_uuid; + String partition_id; + std::atomic parts_in_flight{0}; + size_t parts_submitted{0}; + std::chrono::steady_clock::time_point start_time; + UInt64 preload_level{0}; + + PreloadEntry(String tn, String uuid, String pid, size_t submitted, UInt64 level) + : table_name(std::move(tn)) + , table_uuid(std::move(uuid)) + , partition_id(std::move(pid)) + , parts_submitted(submitted) + , start_time(std::chrono::steady_clock::now()) + , preload_level(level) + { + } + + // non-copyable due to atomics + PreloadEntry(const PreloadEntry &) = delete; + 
PreloadEntry & operator=(const PreloadEntry &) = delete;
+};
+
+struct PreloadPartitionSnapshot
+{
+    String table_name;
+    String table_uuid;
+    String partition_id;
+    size_t parts_in_flight;
+    size_t parts_submitted;
+    UInt64 elapsed_ms;
+    UInt64 preload_level;
+};
+
+/// Global registry tracking in-flight async preload tasks, grouped by (table_uuid, partition_id).
+class PreloadRegistry
+{
+public:
+    static PreloadRegistry & instance();
+
+    /// Register parts_count tasks for a partition. Call partFinished() once per part
+    /// from within the task lambda to decrement the in-flight counter.
+    /// The entry is removed automatically when parts_in_flight drops to zero.
+    void registerParts(const String & table_name, const String & table_uuid,
+                       const String & partition_id, size_t parts_count, UInt64 preload_level);
+
+    /// Decrement in-flight count for a partition. Removes entry when it reaches zero.
+    void partFinished(const String & table_uuid, const String & partition_id);
+
+    std::vector getSnapshot() const;
+
+private:
+    using Key = std::pair; // (table_uuid, partition_id)
+    struct PairHash
+    {
+        size_t operator()(const Key & k) const
+        {
+            size_t h = std::hash{}(k.first);
+            h ^= std::hash{}(k.second) + 0x9e3779b9 + (h << 6) + (h >> 2);
+            return h;
+        }
+    };
+
+    mutable std::mutex mu;
+    std::unordered_map, PairHash> entries;
+};
+
+}
diff --git a/src/Storages/DiskCache/TTLCacheFDBIndex.cpp b/src/Storages/DiskCache/TTLCacheFDBIndex.cpp
new file mode 100644
index 00000000000..5b86a05b8a2
--- /dev/null
+++ b/src/Storages/DiskCache/TTLCacheFDBIndex.cpp
@@ -0,0 +1,327 @@
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+
+TTLCacheFDBIndex::TTLCacheFDBIndex(
+    std::shared_ptr metastore_,
+    const String & name_space,
+    const String & worker_id,
+    const String & table_uuid,
+    const String & own_endpoint_)
+    : metastore(std::move(metastore_))
+    , key_prefix(Catalog::escapeString(name_space) + "_DCI_" + 
Catalog::escapeString(worker_id) + "_" + table_uuid) + , rev_key_prefix(Catalog::escapeString(name_space) + "_DCIREV_" + table_uuid) + , own_worker_id(own_endpoint_) + , log(&Poco::Logger::get("TTLCacheFDBIndex")) +{ + bg = std::thread([this] { bgLoop(); }); +} + +TTLCacheFDBIndex::~TTLCacheFDBIndex() +{ + { + std::lock_guard lk(mu); + stopped = true; + } + cv.notify_all(); + if (bg.joinable()) + bg.join(); +} + +String TTLCacheFDBIndex::makeSegKey(UInt128 key, const String & partition_id) const +{ + return key_prefix + "_" + partition_id + "_" + getHexUIntLowercase(key.items[0]) + "_" + getHexUIntLowercase(key.items[1]); +} + +String TTLCacheFDBIndex::makePartPrefix(const String & partition_id, UInt64 hash_high) const +{ + return key_prefix + "_" + partition_id + "_" + getHexUIntLowercase(hash_high) + "_"; +} + +String TTLCacheFDBIndex::makeRevKey(UInt128 key, const String & partition_id) const +{ + return rev_key_prefix + "_" + partition_id + "_" + getHexUIntLowercase(key.items[0]) + "_" + getHexUIntLowercase(key.items[1]); +} + +String TTLCacheFDBIndex::makeRevPartPrefix(const String & partition_id, UInt64 hash_high) const +{ + return rev_key_prefix + "_" + partition_id + "_" + getHexUIntLowercase(hash_high) + "_"; +} + +String TTLCacheFDBIndex::encodeValue(const String & seg_name, size_t size, time_t part_ts) +{ + // Format: "part_ts:size:seg_name" + // seg_name uses '/' as separator internally, no ':' — safe delimiter + return fmt::format("{}:{}:{}", static_cast(part_ts), size, seg_name); +} + +bool TTLCacheFDBIndex::decodeValue(const String & raw, String & seg_name, size_t & size, time_t & part_ts) +{ + auto p1 = raw.find(':'); + if (p1 == String::npos) + return false; + auto p2 = raw.find(':', p1 + 1); + if (p2 == String::npos) + return false; + + try + { + part_ts = static_cast(std::stoll(raw.substr(0, p1))); + size = static_cast(std::stoull(raw.substr(p1 + 1, p2 - p1 - 1))); + seg_name = raw.substr(p2 + 1); + return !seg_name.empty(); + } + catch (...) 
{ return false; }
+}
+
/// Enqueue forward (DCI) + reverse (DCIREV) index writes for a segment that was
/// just cached. The partition_id is the YYYYMMDD day derived from part_ts in UTC
/// (gmtime_r), matching the on-disk path layout.
+void TTLCacheFDBIndex::onSet(UInt128 key, const String & seg_name, size_t size, time_t part_ts)
+{
+    // partition_id is the YYYYMMDD component of the file path, derived from part_ts
+    struct tm t{};
+    gmtime_r(&part_ts, &t);
+    String partition_id = fmt::format("{:04d}{:02d}{:02d}", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday);
+
+    PendingOp fwd;
+    fwd.type = PendingOp::Type::Set;
+    fwd.key = makeSegKey(key, partition_id);
+    fwd.value = encodeValue(seg_name, size, part_ts);
+
+    PendingOp rev;
+    rev.type = PendingOp::Type::Set;
+    rev.key = makeRevKey(key, partition_id);
+    rev.value = own_worker_id;
+
+    {
+        std::lock_guard lk(mu);
+        queue.push_back(std::move(fwd));
+        queue.push_back(std::move(rev));
+    }
+    cv.notify_one();
+}
+
/// Enqueue prefix-deletes covering every segment of one part, in both indexes.
+void TTLCacheFDBIndex::evictPart(const String & partition_id, UInt64 hash_high)
+{
+    PendingOp fwd;
+    fwd.type = PendingOp::Type::Evict;
+    fwd.key = makePartPrefix(partition_id, hash_high);
+
+    PendingOp rev;
+    rev.type = PendingOp::Type::Evict;
+    rev.key = makeRevPartPrefix(partition_id, hash_high);
+
+    {
+        std::lock_guard lk(mu);
+        queue.push_back(std::move(fwd));
+        queue.push_back(std::move(rev));
+    }
+    cv.notify_one();
+}
+
/// Background flusher: wakes when BATCH_SIZE ops are queued, on shutdown, or
/// after MAX_WAIT_MS at the latest. Drains at most BATCH_SIZE ops per iteration,
/// preserving queue order, and keeps draining after stop until the queue is
/// empty — no queued ops are lost on shutdown.
+void TTLCacheFDBIndex::bgLoop()
+{
+    while (true)
+    {
+        std::vector batch;
+        {
+            std::unique_lock lk(mu);
+            cv.wait_for(lk, std::chrono::milliseconds(MAX_WAIT_MS),
+                [this] { return stopped || queue.size() >= BATCH_SIZE; });
+
+            if (stopped && queue.empty())
+                return;
+
+            size_t n = std::min(queue.size(), BATCH_SIZE);
+            batch.reserve(n);
+            for (size_t i = 0; i < n; ++i)
+            {
+                batch.push_back(std::move(queue.front()));
+                queue.pop_front();
+            }
+        }
+
+        if (!batch.empty())
+            flush(batch);
+    }
+}
+
/// Apply one drained batch to the metastore.
///
/// FIX(review): the previous implementation committed ALL Set ops in one
/// batchWrite first and ran ALL Evict (prefix clean()) ops afterwards. That
/// reorders operations within a batch: a Set enqueued AFTER an Evict of an
/// overlapping prefix (re-caching a part right after it was dropped) was
/// written first and then deleted by the older Evict, silently losing the
/// index entry until the next reconcile. Ops are now applied in enqueue order:
/// consecutive Sets are accumulated into one batchWrite, committed before each
/// Evict runs. All failures remain best-effort (logged, not retried), as before.
+void TTLCacheFDBIndex::flush(std::vector & ops)
+{
+    size_t idx = 0;
+    const size_t count = ops.size();
+    while (idx < count)
+    {
+        if (ops[idx].type == PendingOp::Type::Set)
+        {
+            // Gather the maximal run of consecutive Sets into a single batch.
+            Catalog::BatchCommitRequest batch;
+            while (idx < count && ops[idx].type == PendingOp::Type::Set)
+            {
+                batch.AddPut(Catalog::SinglePutRequest(ops[idx].key, ops[idx].value));
+                ++idx;
+            }
+            try
+            {
+                Catalog::BatchCommitResponse resp;
+                metastore->batchWrite(batch, resp);
+            }
+            catch (...)
+            {
+                tryLogCurrentException(log, "TTLCacheFDBIndex: batch write failed");
+            }
+        }
+        else
+        {
+            // Evict: everything enqueued before it has been committed above.
+            try { metastore->clean(ops[idx].key); }
+            catch (...) { tryLogCurrentException(log, "TTLCacheFDBIndex: clean failed for " + ops[idx].key); }
+            ++idx;
+        }
+    }
+}
+
/// Look up the reverse index for a peer worker that has this segment cached.
/// Returns the value this class stores in DCIREV_ entries (own_worker_id), or
/// nullopt when the key is absent, the metastore read fails, the value is
/// empty, or it names ourselves.
+std::optional TTLCacheFDBIndex::findPeerOwner(UInt128 key, const String & partition_id)
+{
+    String rev_key = makeRevKey(key, partition_id);
+    String endpoint;
+    try
+    {
+        if (metastore->get(rev_key, endpoint) == 0)
+            return std::nullopt; // key not found
+    }
+    catch (...)
+    {
+        tryLogCurrentException(log, "TTLCacheFDBIndex: findPeerOwner FDB get failed");
+        return std::nullopt;
+    }
+
+    // endpoint now holds the peer's worker_id; skip if it's ourselves
+    if (endpoint.empty() || endpoint == own_worker_id)
+        return std::nullopt;
+
+    return endpoint; // caller resolves worker_id → host:port via DiskCacheFactory
+}
+
/// NOTE(review): std::function/std::optional template arguments below were
/// stripped by the extraction tooling; restore from the repository.
+std::optional> TTLCacheFDBIndex::reconcile(
+    const VolumePtr & volume,
+    std::function get_rel_path,
+    std::function should_cache,
+    std::function on_reconcile_batch,
+    std::function on_stats_update)
+{
+    // Page through FDB in chunks to avoid hitting the 5-second transaction timeout
+    // that occurs when scanning millions of entries in a single transaction.
+    static constexpr size_t PAGE_SIZE = 100'000;
+
+    const auto & disks = volume->getDisks();
+    if (disks.empty())
+        return std::nullopt;
+
+    size_t total_restored = 0;
+    size_t total_restored_bytes = 0;
+    size_t total_stale = 0;
+    String scan_start_key; // empty = start from key_prefix
+
+    while (true)
+    {
+        ReconcileBatch page;
/// NOTE(review): reserves a full 100'000-entry page up front on every
/// iteration — negligible next to the scan, but worth confirming for tiny tables.
+        page.reserve(PAGE_SIZE);
+        std::vector stale_fwd_keys;
+        size_t page_bytes = 0;
+        size_t page_count = 0;
+        String last_key;
+
+        Catalog::IMetaStore::IteratorPtr it;
+        try { it = metastore->getByPrefix(key_prefix, PAGE_SIZE, DEFAULT_SCAN_BATCH_COUNT, scan_start_key); }
+        catch (...) { tryLogCurrentException(log, "TTLCacheFDBIndex: getByPrefix failed"); return std::nullopt; }
+
+        while (it->next())
+        {
+            last_key = it->key();
+            page_count++;
+
+            String seg_name;
+            size_t size{0};
+            time_t part_ts{0};
+
/// Undecodable and TTL-expired entries are collected and deleted in bulk below.
+            if (!decodeValue(it->value(), seg_name, size, part_ts))
+            {
+                LOG_WARNING(log, "TTLCacheFDBIndex reconcile: decode failed for key={} value={}", it->key(), it->value());
+                stale_fwd_keys.push_back(last_key);
+                continue;
+            }
+
+            if (!should_cache(part_ts))
+            {
+                LOG_DEBUG(log, "TTLCacheFDBIndex reconcile: TTL expired for seg={} part_ts={}", seg_name, part_ts);
+                stale_fwd_keys.push_back(last_key);
+                continue;
+            }
+
+            auto key = DiskCacheTTL::hash(seg_name);
+            auto rel_path = get_rel_path(key, seg_name);
+
+            // TODO: multi-disk JBOD support — store disk name in FDB value so reconcile can
+            // assign the correct disk without a per-file exists() scan across all disks.
+            // For now assume single-disk volume (one PVC per pod) and trust FDB as authoritative,
+            // skipping the per-file exists() syscall (too costly at millions of entries).
+            page.emplace_back(key, std::make_shared(
+                DiskCacheTTLMeta::State::Cached, disks[0], size, time(nullptr), part_ts, rel_path.string()));
+            page_bytes += size;
+        }
+
+        if (!page.empty())
+        {
+            on_reconcile_batch(page);
+            DiskCacheFactory::instance().addGlobalTTLUsage(page_bytes);
+            if (on_stats_update)
+            {
+                for (const auto & [key, meta] : page)
+                    on_stats_update(meta->max_timestamp, meta->size);
+            }
+        }
+
+        if (!stale_fwd_keys.empty())
+        {
+            try
+            {
+                Catalog::BatchCommitRequest batch;
+                for (const auto & fwd : stale_fwd_keys)
+                {
+                    batch.AddDelete(Catalog::SingleDeleteRequest(fwd));
/// The reverse key shares the forward key's suffix (partition + hashes), so it
/// is derived by swapping key_prefix for rev_key_prefix.
+                    String rev = rev_key_prefix + fwd.substr(key_prefix.size());
+                    batch.AddDelete(Catalog::SingleDeleteRequest(rev));
+                }
+                Catalog::BatchCommitResponse resp;
+                metastore->batchWrite(batch, resp);
+                LOG_DEBUG(log, "TTLCacheFDBIndex reconcile: removed {} stale fwd+rev pairs", stale_fwd_keys.size());
+            }
+            catch (...) { tryLogCurrentException(log, "TTLCacheFDBIndex: stale cleanup failed"); }
+        }
+
+        total_restored += page.size();
+        total_restored_bytes += page_bytes;
+        total_stale += stale_fwd_keys.size();
+
/// A short page means the prefix range is exhausted (page_count counts every
/// scanned key, including stale ones, so an all-stale full page still advances).
+        if (page_count < PAGE_SIZE)
+            break;
+
+        // Advance past the last key seen ('\x00' suffix = next key in FDB ordering).
+        scan_start_key = last_key + '\x00';
+    }
+
+    LOG_INFO(log, "TTLCacheFDBIndex reconcile complete: {} entries restored, {} stale removed", total_restored, total_stale);
+
+    if (total_restored == 0)
+        return std::nullopt;
+    return std::make_pair(total_restored, total_restored_bytes);
+}
+
+}
diff --git a/src/Storages/DiskCache/TTLCacheFDBIndex.h b/src/Storages/DiskCache/TTLCacheFDBIndex.h
new file mode 100644
index 00000000000..e55a1b2fb47
--- /dev/null
+++ b/src/Storages/DiskCache/TTLCacheFDBIndex.h
@@ -0,0 +1,105 @@
+#pragma once
+
/// NOTE(review): include targets and template arguments in this header were
/// stripped by the extraction tooling; restore from the repository before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+class DiskCacheTTL;
+class DiskCacheTTLMeta;
+
+/// FDB-backed index for DiskCacheTTL.
+/// On set(): async-writes an entry so the in-memory cache_map can be restored from
+/// FDB on the next startup instead of doing a slow disk scan.
+/// On evictPart(): issues a single FDB clean() covering all segments of a part.
+/// reconcile(): called from load() — scans FDB, verifies files on disk, populates cache_map.
+class TTLCacheFDBIndex
+{
+public:
+    TTLCacheFDBIndex(
+        std::shared_ptr metastore_,
+        const String & name_space,
+        const String & worker_id,
+        const String & table_uuid,
+        const String & own_endpoint_);
+
/// Stops and joins the background flush thread after draining the queue.
+    ~TTLCacheFDBIndex();
+
+    /// Enqueue async FDB write after a segment is successfully cached.
+    void onSet(UInt128 key, const String & seg_name, size_t size, time_t part_ts);
+
+    /// Issue FDB clean() for all segments of one part (hash_high).
+    /// partition_id: YYYYMMDD string derived from max_timestamp (same as path structure).
+    void evictPart(const String & partition_id, UInt64 hash_high);
+
+    /// Look up whether any peer worker has this segment cached.
+    /// Returns peer RPC endpoint (host:port) if found, nullopt otherwise.
+    std::optional findPeerOwner(UInt128 key, const String & partition_id);
+
+    /// Scan FDB index and restore cache_map.
+    /// Calls on_stats_update for each successfully restored entry so the
+    /// caller can update partition_stats without re-scanning cache_map
+    /// Returns {entries, bytes} restored, or nullopt if index is empty/unavailable.
+    using ReconcileBatch = std::vector>>;
+
+    std::optional> reconcile(
+        const VolumePtr & volume,
+        std::function get_rel_path,
+        std::function should_cache,
+        std::function on_reconcile_batch,
+        std::function on_stats_update = nullptr);
+
+private:
/// One queued FDB mutation, applied asynchronously by bgLoop()/flush().
+    struct PendingOp
+    {
+        enum class Type { Set, Evict } type;
+        String key; // full FDB key (Set) or prefix to clean (Evict)
+        String value; // serialized entry (Set only)
+    };
+
+    void bgLoop();
+    void flush(std::vector & ops);
+
+    String makeSegKey(UInt128 key, const String & partition_id) const;
+    String makePartPrefix(const String & partition_id, UInt64 hash_high) const;
+
+    static String encodeValue(const String & seg_name, size_t size, time_t part_ts);
+    static bool decodeValue(const String & raw, String & seg_name, size_t & size, time_t & part_ts);
+
+    String makeRevKey(UInt128 key, const String & partition_id) const;
+    String makeRevPartPrefix(const String & partition_id, UInt64 hash_high) const;
+
+    std::shared_ptr metastore;
+    String key_prefix; // escapeString(ns) + "_DCI_" + escapeString(worker_id) + "_" + table_uuid
+    String rev_key_prefix; // escapeString(ns) + "_DCIREV_" + table_uuid
+    String own_worker_id; // stable worker identity (WORKER_ID env), stored in DCIREV_ values and used to skip self
+
/// mu/cv guard `queue` and the `stopped` flag; bg is the single flusher thread.
+    std::mutex mu;
+    std::deque queue;
+    std::condition_variable cv;
+    std::thread bg;
+    std::atomic stopped{false};
+
+    static constexpr size_t BATCH_SIZE = 100;
+    static constexpr size_t MAX_WAIT_MS = 5000;
+
+    Poco::Logger * log;
+};
+
+}
diff --git a/src/Storages/DiskCache/tests/gtest_disk_cache_key_test.cpp b/src/Storages/DiskCache/tests/gtest_disk_cache_key_test.cpp
index 67c17a49f87..4091376e300 100644
--- a/src/Storages/DiskCache/tests/gtest_disk_cache_key_test.cpp
+++ 
b/src/Storages/DiskCache/tests/gtest_disk_cache_key_test.cpp
@@ -15,6 +15,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -45,4 +46,46 @@ TEST(DiskCache, DiskCachePathTest)
     EXPECT_NE(path1.filename(), path2.filename());
 }
+
+// TTL cache key tests
/// Round-trips a hashed key through hexKey()/unhexKey() and checks malformed
/// hex strings (bad chars, wrong length) are rejected.
+TEST(DiskCacheTTL, UnhexKeyTest)
+{
+    String table_uuid = UUIDHelpers::UUIDToString(UUIDHelpers::generateV4());
+    String seg_key = IDiskCacheSegment::formatSegmentName(table_uuid, "20240315_1_100_2", "col", 0, ".bin");
+
+    DiskCacheTTL::KeyType key = DiskCacheTTL::hash(seg_key);
+    String hex_key = DiskCacheTTL::hexKey(key);
+    auto unhex = DiskCacheTTL::unhexKey(hex_key);
+    EXPECT_TRUE(unhex.has_value());
+    EXPECT_EQ(unhex.value(), key);
+
+    // Invalid hex keys
+    EXPECT_FALSE(DiskCacheTTL::unhexKey("invalid").has_value());
+    EXPECT_FALSE(DiskCacheTTL::unhexKey("12345").has_value());
+    EXPECT_FALSE(DiskCacheTTL::unhexKey("gggggggggggggggggggggggggggggggg").has_value());
+}
+
/// Segments of the same part must share the part-level hash component while
/// differing in the segment-level component; different partitions differ in both.
+TEST(DiskCacheTTL, PartitionHierarchyPathTest)
+{
+    String table_uuid = UUIDHelpers::UUIDToString(UUIDHelpers::generateV4());
+    String seg_key1 = IDiskCacheSegment::formatSegmentName(table_uuid, "20240315_1_100_2", "col", 0, ".bin");
+    String seg_key2 = IDiskCacheSegment::formatSegmentName(table_uuid, "20240315_1_100_2", "col", 0, ".mrk");
+    String seg_key3 = IDiskCacheSegment::formatSegmentName(table_uuid, "20240316_1_100_2", "col", 0, ".bin");
+
+    auto key1 = DiskCacheTTL::hash(seg_key1);
+    auto key2 = DiskCacheTTL::hash(seg_key2);
+    auto key3 = DiskCacheTTL::hash(seg_key3);
+
+    // Path structure doesn't need cache instance, just use hexKey to verify structure
+    String hex1 = DiskCacheTTL::hexKey(key1);
+    String hex2 = DiskCacheTTL::hexKey(key2);
+    String hex3 = DiskCacheTTL::hexKey(key3);
+
+    // Verify same part -> same hash_high (second half of the 32-char hex string)
+    EXPECT_EQ(hex1.substr(16, 16), hex2.substr(16, 16)); // hash_high for same part
+    EXPECT_NE(hex1.substr(0, 16), hex2.substr(0, 16)); // hash_low differs (different segments)
+
+    // Different partitions -> different hash_high
+    EXPECT_NE(hex1.substr(16, 16), hex3.substr(16, 16));
+}
+
 }
diff --git a/src/Storages/DiskCache/tests/gtest_disk_cache_ttl_test.cpp b/src/Storages/DiskCache/tests/gtest_disk_cache_ttl_test.cpp
new file mode 100644
index 00000000000..d062e12a54a
--- /dev/null
+++ b/src/Storages/DiskCache/tests/gtest_disk_cache_ttl_test.cpp
@@ -0,0 +1,1696 @@
+/*
+ * Copyright (2022) Bytedance Ltd. and/or its affiliates
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/// NOTE(review): include targets and template arguments in this file were
/// stripped by the extraction tooling; restore from the repository before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace fs = std::filesystem;
+
+namespace DB
+{
+
/// Fixture: fresh tmp/ working dir per test, shared Context per suite,
/// IDiskCache init/close around every test.
+class DiskCacheTTLTest : public ::testing::Test
+{
+public:
+    static void SetUpTestCase()
+    {
+        Poco::AutoPtr formatter(new Poco::PatternFormatter("%Y.%m.%d %H:%M:%S.%F <%p> %s: %t"));
+        Poco::AutoPtr console_chanel(new Poco::ConsoleChannel);
+        Poco::AutoPtr channel(new Poco::FormattingChannel(formatter, console_chanel));
+        Poco::Logger::root().setLevel("trace");
+        Poco::Logger::root().setChannel(channel);
+
+        ctx = getContext().context;
+    }
+
+    static void TearDownTestCase()
+    {
+        ctx->shutdown();
+    }
+
+    void SetUp() override
+    {
+        fs::remove_all("tmp/");
+        fs::create_directories("tmp/");
+        fs::create_directory("tmp/ttl_cache/");
+        UnitTest::initLogger();
+        DB::IDiskCache::init(*getContext().context);
+    }
+
+    void TearDown() override
+    {
+        fs::remove_all("tmp/");
+        DB::IDiskCache::close();
+    }
+
/// Single local disk under tmp/ttl_disk/.
+    VolumePtr createTestVolume()
+    {
+        fs::create_directory("tmp/ttl_disk/");
+        auto disk = std::make_shared("ttl_disk", "tmp/ttl_disk/", DiskStats{});
+        return std::make_shared("ttl_volume", std::move(disk), 0);
+    }
+
/// Two local disks, for the JBOD distribution test.
+    VolumePtr createDualDiskVolume()
+    {
+        fs::create_directory("tmp/ttl_disk1/");
+        fs::create_directory("tmp/ttl_disk2/");
+        Disks disks;
+        disks.emplace_back(std::make_shared("ttl_disk1", "tmp/ttl_disk1/", DiskStats{}));
+        disks.emplace_back(std::make_shared("ttl_disk2", "tmp/ttl_disk2/", DiskStats{}));
+        return std::make_shared("ttl_dual_volume", disks, disks.front()->getName(), 0, false);
+    }
+
+    static std::shared_ptr ctx;
+};
+
+std::shared_ptr DiskCacheTTLTest::ctx = nullptr;
+
+// Test parsing partition timestamps from part names
+TEST_F(DiskCacheTTLTest, ParsePartitionTimestamp)
+{
+    // YYYYMMDD format (20240315)
+    {
+        String part_name = "20240315_1_100_2";
+        time_t ts = DiskCacheTTL::parsePartitionTimestamp(part_name);
+        ASSERT_GT(ts, 0);
+
+        struct tm tm_time;
+        gmtime_r(&ts, &tm_time);
+        ASSERT_EQ(tm_time.tm_year + 1900, 2024);
+        ASSERT_EQ(tm_time.tm_mon + 1, 3);
+        ASSERT_EQ(tm_time.tm_mday, 15);
+    }
+
+    // YYYYMMDDHH format (2024031523)
+    {
+        String part_name = "2024031523_1_100_2";
+        time_t ts = DiskCacheTTL::parsePartitionTimestamp(part_name);
+        ASSERT_GT(ts, 0);
+
+        struct tm tm_time;
+        gmtime_r(&ts, &tm_time);
+        ASSERT_EQ(tm_time.tm_year + 1900, 2024);
+        ASSERT_EQ(tm_time.tm_mon + 1, 3);
+        ASSERT_EQ(tm_time.tm_mday, 15);
+        ASSERT_EQ(tm_time.tm_hour, 23);
+    }
+
+    // YYYYMM format (202403)
+    {
+        String part_name = "202403_1_100_2";
+        time_t ts = DiskCacheTTL::parsePartitionTimestamp(part_name);
+        ASSERT_GT(ts, 0);
+
+        struct tm tm_time;
+        gmtime_r(&ts, &tm_time);
+        ASSERT_EQ(tm_time.tm_year + 1900, 2024);
+        ASSERT_EQ(tm_time.tm_mon + 1, 3);
+        ASSERT_EQ(tm_time.tm_mday, 1);
+    }
+
+    // Non-time partition (string partition)
+    {
+        String part_name = "some_partition_1_100_2";
+        time_t ts = DiskCacheTTL::parsePartitionTimestamp(part_name);
+        ASSERT_EQ(ts, 0);
+    }
+
+    // Invalid format
+    {
+        String part_name = "999_1_100_2";
+        time_t ts = DiskCacheTTL::parsePartitionTimestamp(part_name);
+        ASSERT_EQ(ts, 0);
+    }
+
+    // Empty partition
+    {
+        String part_name = "_1_100_2";
+        time_t ts = DiskCacheTTL::parsePartitionTimestamp(part_name);
+        ASSERT_EQ(ts, 0);
+    }
+}
+
+// Test TTL behavior through set/get operations (tests shouldCache indirectly)
+TEST_F(DiskCacheTTLTest, TTLBehaviorThroughOperations)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60; // 1 hour TTL
+    DiskCacheTTL cache("test_ttl", "test-uuid-0000-0000-0000-000000000001", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+
+    // Recent partition (30 minutes old) - should cache
+    {
+        struct tm tm_time;
+        time_t recent_time = now - (30 * 60);
+        gmtime_r(&recent_time, &tm_time);
+        String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+            tm_time.tm_year + 1900, tm_time.tm_mon + 1, tm_time.tm_mday);
+        String seg = fmt::format("test-uuid-0000-0000-0000-000000000001/{}/col.bin/offset_0", part);
+
+        String data = "test";
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+
+        auto [disk, path] = cache.get(seg);
+        ASSERT_FALSE(path.empty()); // Should be cached
+    }
+
+    // Old partition (2 hours old) - should not cache
+    {
+        struct tm tm_time;
+        time_t old_time = now - (2 * 60 * 60);
+        gmtime_r(&old_time, &tm_time);
+        String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+            tm_time.tm_year + 1900, tm_time.tm_mon + 1, tm_time.tm_mday);
+        String seg = fmt::format("test-uuid-0000-0000-0000-000000000001/{}/col.bin/offset_1", part);
+
+        String data = "test";
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+
+        auto [disk, path] = cache.get(seg);
+        ASSERT_TRUE(path.empty()); // Should NOT be cached
+    }
+}
+
+// ttl_minutes=0 means "cache nothing" — all writes are rejected
+TEST_F(DiskCacheTTLTest, TTLZeroRejectsAll)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, 0, 0);
+
+    time_t now = time(nullptr);
+    struct tm tm_time;
+    gmtime_r(&now, &tm_time);
+    String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_time.tm_year + 1900, tm_time.tm_mon + 1, tm_time.tm_mday);
+    String seg = fmt::format("test-uuid-0000-0000-0000-000000000002/{}/col.bin/offset_0", part);
+
+    String data = "test";
+    ReadBufferFromString buf(data);
+    cache.set(seg, buf, data.size(), false);
+
+    auto [disk, path] = cache.get(seg);
+    ASSERT_TRUE(path.empty()); // ttl_minutes=0 rejects all writes
+    ASSERT_EQ(cache.getStats().rejected_too_old, 1u);
+}
+
+// Test non-time partitions are rejected
+TEST_F(DiskCacheTTLTest, RejectNonTimePartitions)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    // String partition (non-time)
+    String nontime_part = "string_partition_1_100_2";
+    String nontime_seg = fmt::format("test_uuid/{}/column.bin/offset_0", nontime_part);
+
+    // Numeric but invalid date partition
+    String invalid_part = "999_1_100_2";
+    String invalid_seg = fmt::format("test_uuid/{}/column.bin/offset_1", invalid_part);
+
+    // Try to cache non-time partitions - should be rejected
+    {
+        String data = "test data";
+        ReadBufferFromString buf1(data);
+        ReadBufferFromString buf2(data);
+        cache.set(nontime_seg, buf1, data.size(), false);
+        cache.set(invalid_seg, buf2, data.size(), false);
+
+        // Should not be cached
+        auto [disk1, path1] = cache.get(nontime_seg);
+        auto [disk2, path2] = cache.get(invalid_seg);
+        ASSERT_TRUE(path1.empty());
+        ASSERT_TRUE(path2.empty());
+    }
+
+    ASSERT_EQ(cache.getKeyCount(), 0);
+}
+
+// Test basic set/get operations with TTL filtering
+TEST_F(DiskCacheTTLTest, BasicOperations)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60; // 1 hour TTL
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+
+    // Create recent segment name (should be cached)
+    struct tm tm_recent;
+    gmtime_r(&now, &tm_recent);
+    String recent_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_recent.tm_year + 1900, tm_recent.tm_mon + 1, tm_recent.tm_mday);
+    String recent_seg = fmt::format("test_uuid/{}/column.bin/offset_123", recent_part);
+
+    // Create old segment name (should not be cached)
+    time_t old_time = now - (2 * 60 * 60); // 2 hours ago
+    struct tm tm_old;
+    gmtime_r(&old_time, &tm_old);
+    String old_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_old.tm_year + 1900, tm_old.tm_mon + 1, tm_old.tm_mday);
+    String old_seg = fmt::format("test_uuid/{}/column.bin/offset_456", old_part);
+
+    // Try to set recent segment - should succeed
+    {
+        String test_data = "test data content";
+        ReadBufferFromString buffer(test_data);
+        cache.set(recent_seg, buffer, test_data.size(), false);
+
+        auto [disk, path] = cache.get(recent_seg);
+        ASSERT_FALSE(path.empty());
+        ASSERT_TRUE(disk != nullptr);
+    }
+
+    // Try to set old segment - should be rejected (not cached due to TTL)
+    {
+        String test_data = "old data content";
+        ReadBufferFromString buffer(test_data);
+        cache.set(old_seg, buffer, test_data.size(), false);
+
+        auto [disk, path] = cache.get(old_seg);
+        ASSERT_TRUE(path.empty()); // Should not be cached
+    }
+}
+
+// Test eviction of expired entries
/// NOTE(review): asserts eviction has happened within a 100 ms sleep — relies on
/// eviction being triggered promptly by set()/get(); potential flakiness on
/// loaded CI hosts, confirm the trigger is synchronous.
+TEST_F(DiskCacheTTLTest, EvictExpired)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60; // 1 hour TTL
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+
+    // Create recent partition (30 minutes old - should survive)
+    struct tm tm_recent;
+    time_t recent_time = now - (30 * 60);
+    gmtime_r(&recent_time, &tm_recent);
+    String recent_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_recent.tm_year + 1900, tm_recent.tm_mon + 1, tm_recent.tm_mday);
+    String recent_seg = fmt::format("test-uuid-0000-0000-0000-000000000005/{}/column.bin/offset_0", recent_part);
+
+    // Create old partition (2 hours old - should be evicted)
+    struct tm tm_old;
+    time_t old_time = now - (2 * 60 * 60);
+    gmtime_r(&old_time, &tm_old);
+    String old_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_old.tm_year + 1900, tm_old.tm_mon + 1, tm_old.tm_mday);
+    String old_seg = fmt::format("test-uuid-0000-0000-0000-000000000005/{}/column.bin/offset_1", old_part);
+
+    // Add both segments
+    String data = "test data";
+    ReadBufferFromString buf1(data);
+    ReadBufferFromString buf2(data);
+    cache.set(recent_seg, buf1, data.size(), false);
+    cache.set(old_seg, buf2, data.size(), false);
+
+    // Verify both exist initially
+    size_t initial_count = cache.getKeyCount();
+    ASSERT_EQ(initial_count, 2);
+
+    // Wait a moment for potential background eviction
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+    // Old partition should be evicted, recent should remain
+    ASSERT_EQ(cache.getKeyCount(), 1);
+
+    auto [disk1, path1] = cache.get(recent_seg);
+    auto [disk2, path2] = cache.get(old_seg);
+
+    ASSERT_FALSE(path1.empty()); // Recent still cached
+    ASSERT_TRUE(path2.empty()); // Old evicted
+}
+
+// Periodic eviction is tested indirectly through EvictExpired test
+// (eviction happens automatically every hour during get() operations)
+
+// Test concurrent set/get operations
+TEST_F(DiskCacheTTLTest, ConcurrentAccess)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 10 * 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+    struct tm tm_now;
+    gmtime_r(&now, &tm_now);
+    String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+
+    std::vector threads;
+    std::atomic success_count{0};
+
+    // Multiple threads writing different segments
+    for (int i = 0; i < 10; i++)
+    {
+        threads.emplace_back([&, i]() {
+            String seg_name = fmt::format("test_uuid/{}/col.bin/offset_{}", part, i);
+            String data = fmt::format("data_{}", i);
+            ReadBufferFromString buffer(data);
+            cache.set(seg_name, buffer, data.size(), false);
+
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+            auto [disk, path] = cache.get(seg_name);
+            if (!path.empty())
+                success_count++;
+        });
+    }
+
+    for (auto& t : threads)
+        t.join();
+
+    ASSERT_EQ(success_count, 10);
+    ASSERT_EQ(cache.getKeyCount(), 10);
+}
+
+// Test drop() method removes part segments
+TEST_F(DiskCacheTTLTest, DropPart)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+    struct tm tm_now;
+    gmtime_r(&now, &tm_now);
+    String part1 = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+    String part2 = fmt::format("{:04d}{:02d}{:02d}_2_200_2",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+
+    // Add segments for two parts
+    for (int i = 0; i < 3; i++)
+    {
+        String seg1 = fmt::format("test_uuid/{}/col.bin/offset_{}", part1, i);
+        String seg2 = fmt::format("test_uuid/{}/col.bin/offset_{}", part2, i);
+
+        String data = "test data";
+        ReadBufferFromString buf1(data);
+        ReadBufferFromString buf2(data);
+        cache.set(seg1, buf1, data.size(), false);
+        cache.set(seg2, buf2, data.size(), false);
+    }
+
+    size_t initial_count = cache.getKeyCount();
+    ASSERT_EQ(initial_count, 6);
+
+    // Drop part1 — path must include the UUID prefix used in segment names
+    cache.drop("test_uuid/" + part1);
+    ASSERT_EQ(cache.getKeyCount(), 3);
+
+    // Verify part1 gone, part2 remains
+    String seg1_check = fmt::format("test_uuid/{}/col.bin/offset_0", part1);
+    String seg2_check = fmt::format("test_uuid/{}/col.bin/offset_0", part2);
+
+    auto [disk1, path1] = cache.get(seg1_check);
+    auto [disk2, path2] = cache.get(seg2_check);
+
+    ASSERT_TRUE(path1.empty());
+    ASSERT_FALSE(path2.empty());
+}
+
+// Test cache stats
+TEST_F(DiskCacheTTLTest, CacheStats)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    ASSERT_EQ(cache.getKeyCount(), 0);
+    ASSERT_EQ(cache.getCachedSize(), 0);
+
+    time_t now = time(nullptr);
+    struct tm tm_now;
+    gmtime_r(&now, &tm_now);
+    String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+
+    // Add entries
+    for (int i = 0; i < 5; i++)
+    {
+        String seg_name = fmt::format("test_uuid/{}/col.bin/offset_{}", part, i);
+        String data = String(100, 'a');
+        ReadBufferFromString buffer(data);
+        cache.set(seg_name, buffer, data.size(), false);
+    }
+
+    ASSERT_EQ(cache.getKeyCount(), 5);
+    ASSERT_GT(cache.getCachedSize(), 0);
+}
+
+// Test multi-disk volume
+TEST_F(DiskCacheTTLTest, MultiDiskVolume)
+{
+    auto volume = createDualDiskVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+    struct tm tm_now;
+    gmtime_r(&now, &tm_now);
+    String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+
+    // Add multiple segments to trigger distribution across disks
+    for (int i = 0; i < 10; i++)
+    {
+        String seg_name = fmt::format("test_uuid/{}/col.bin/offset_{}", part, i);
+        String data = String(1000, 'a');
+        ReadBufferFromString buffer(data);
+        cache.set(seg_name, buffer, data.size(), false);
+    }
+
+    ASSERT_EQ(cache.getKeyCount(), 10);
+
+    // Verify all can be retrieved
+    for (int i = 0; i < 10; i++)
+    {
+        String seg_name = fmt::format("test_uuid/{}/col.bin/offset_{}", part, i);
+        auto [disk, path] = cache.get(seg_name);
+        ASSERT_FALSE(path.empty());
+        ASSERT_TRUE(disk != nullptr);
+    }
+}
+
+// Test detailed statistics collection
/// NOTE(review): the cache is constructed with table_uuid "test-uuid" but
/// stats.table_uuid is expected to equal the full UUID embedded in the segment
/// names — implies getStats() derives the uuid from segment keys; confirm.
+TEST_F(DiskCacheTTLTest, DetailedStats)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 10 * 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+
+    // Add recent entries (should be cached)
+    struct tm tm_recent;
+    time_t recent_time = now - (30 * 60);
+    gmtime_r(&recent_time, &tm_recent);
+    String recent_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_recent.tm_year + 1900, tm_recent.tm_mon + 1, tm_recent.tm_mday);
+
+    for (int i = 0; i < 5; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000b/{}/col.bin/offset_{}", recent_part, i);
+        String data = String(100, 'a');
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+    }
+
+    // Try to add old entries (should be rejected)
+    struct tm tm_old;
+    time_t old_time = now - (2 * 60 * 60);
+    gmtime_r(&old_time, &tm_old);
+    String old_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_old.tm_year + 1900, tm_old.tm_mon + 1, tm_old.tm_mday);
+
+    for (int i = 0; i < 3; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000b/{}/col.bin/offset_{}", old_part, i);
+        String data = String(100, 'a');
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+    }
+
+    // Try to add non-time partition (should be rejected)
+    String nontime_part = "string_partition_1_100_2";
+    for (int i = 0; i < 2; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000b/{}/col.bin/offset_{}", nontime_part, i);
+        String data = String(100, 'a');
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+    }
+
+    // Check global stats
+    auto stats = cache.getStats();
+    ASSERT_EQ(stats.table_uuid, "test-uuid-0000-0000-0000-00000000000b");
+    ASSERT_EQ(stats.total_entries, 5); // Only recent entries
+    ASSERT_GT(stats.total_bytes, 0);
+    ASSERT_EQ(stats.rejected_too_old, 3); // Old entries rejected
+    ASSERT_EQ(stats.rejected_non_time_partition, 2); // Non-time rejected
+
+    // Perform gets (hits)
+    for (int i = 0; i < 5; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000b/{}/col.bin/offset_{}", recent_part, i);
+        auto [disk, path] = cache.get(seg);
+        ASSERT_FALSE(path.empty());
+    }
+
+    // Perform gets (misses)
+    for (int i = 0; i < 3; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000b/{}/col.bin/offset_{}", old_part, i);
+        auto [disk, path] = cache.get(seg);
+        ASSERT_TRUE(path.empty());
+    }
+
+    // Check partition stats
+    auto partition_stats = cache.getPartitionStats();
+    ASSERT_GE(partition_stats.size(), 1);
+
+    // Find recent partition stats
+    String recent_partition_id = fmt::format("{:04d}{:02d}{:02d}",
+        tm_recent.tm_year + 1900, tm_recent.tm_mon + 1, tm_recent.tm_mday);
+
+    bool found_recent = false;
+    for (const auto & ps : partition_stats)
+    {
+        if (ps.partition_id == recent_partition_id)
+        {
+            found_recent = true;
+            ASSERT_EQ(ps.entry_count, 5);
+            ASSERT_GT(ps.total_bytes, 0);
+            break;
+        }
+    }
+    ASSERT_TRUE(found_recent);
+}
+
+// Test stats after eviction
/// NOTE(review): same 100 ms timing dependence as EvictExpired above.
+TEST_F(DiskCacheTTLTest, StatsAfterEviction)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+
+    // Add recent entries
+    struct tm tm_recent;
+    time_t recent_time = now - (30 * 60);
+    gmtime_r(&recent_time, &tm_recent);
+    String recent_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_recent.tm_year + 1900, tm_recent.tm_mon + 1, tm_recent.tm_mday);
+
+    for (int i = 0; i < 3; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000c/{}/col.bin/offset_{}", recent_part, i);
+        String data = "test data";
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+    }
+
+    // Add old entries (will be cached initially but evicted later)
+    struct tm tm_old;
+    time_t old_time = now - (2 * 60 * 60);
+    gmtime_r(&old_time, &tm_old);
+    String old_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_old.tm_year + 1900, tm_old.tm_mon + 1, tm_old.tm_mday);
+
+    for (int i = 0; i < 2; i++)
+    {
+        String seg = fmt::format("test-uuid-0000-0000-0000-00000000000c/{}/col.bin/offset_{}", old_part, i);
+        String data = "test data";
+        ReadBufferFromString buf(data);
+        cache.set(seg, buf, data.size(), false);
+    }
+
+    auto stats_before = cache.getStats();
+    size_t entries_before = stats_before.total_entries;
+
+    // Wait for eviction
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+    auto stats_after = cache.getStats();
+
+    // Old entries should be evicted
+    ASSERT_LT(stats_after.total_entries, entries_before);
+    ASSERT_GT(stats_after.evicted_expired, 0);
+    ASSERT_GT(stats_after.last_eviction_run, 0);
+}
+
+// Test per-partition hit rate calculation
+TEST_F(DiskCacheTTLTest, PartitionHitRate)
+{
+    auto volume = createTestVolume();
+    DiskCacheSettings settings;
+    settings.ttl_cache_max_size = 1024 * 1024;
+    auto strategy = std::make_shared(settings);
+
+    UInt64 ttl_minutes = 60;
+    DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, ttl_minutes, 0);
+
+    time_t now = time(nullptr);
+    struct tm tm_now;
+    gmtime_r(&now, &tm_now);
+    String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+    String partition_id = fmt::format("{:04d}{:02d}{:02d}",
+        tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday);
+
+    // Add 10 segments
+    for (int i
= 0; i < 10; i++) + { + String seg = fmt::format("test-uuid-0000-0000-0000-00000000000d/{}/col.bin/offset_{}", part, i); + String data = "test"; + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // Hit 7 segments, miss 3 + for (int i = 0; i < 7; i++) + { + String seg = fmt::format("test-uuid-0000-0000-0000-00000000000d/{}/col.bin/offset_{}", part, i); + auto [disk, path] = cache.get(seg); + ASSERT_FALSE(path.empty()); + } + + for (int i = 10; i < 13; i++) // Non-existent segments + { + String seg = fmt::format("test-uuid-0000-0000-0000-00000000000d/{}/col.bin/offset_{}", part, i); + auto [disk, path] = cache.get(seg); + ASSERT_TRUE(path.empty()); + } + + auto partition_stats = cache.getPartitionStats(); + bool found = false; + for (const auto & ps : partition_stats) + { + if (ps.partition_id == partition_id) + { + found = true; + ASSERT_GT(ps.entry_count, 0); + ASSERT_GT(ps.total_bytes, 0); + break; + } + } + ASSERT_TRUE(found); +} + +// Test async size-based eviction +TEST_F(DiskCacheTTLTest, AsyncSizeBasedEviction) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; // 1MB limit + auto strategy = std::make_shared(settings); + + UInt64 ttl_minutes = 60; + DiskCacheTTL cache("test_async_eviction", "test-uuid-0000-0000-0000-00000000000e", volume, nullptr, settings, strategy, ttl_minutes, 1024 * 1024); + + time_t now = time(nullptr); + struct tm tm_now; + gmtime_r(&now, &tm_now); + + // Fill cache to ~95% (trigger async eviction threshold of 90%) + size_t segment_size = 100 * 1024; // 100KB per segment + int segments_to_add = 10; // 1MB total + + for (int i = 0; i < segments_to_add; i++) + { + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + String seg = fmt::format("test-uuid-0000-0000-0000-00000000000e/{}/col.bin/offset_{}", part, i); + + String data = String(segment_size, 'a'); + ReadBufferFromString 
buf(data); + cache.set(seg, buf, data.size(), false); + } + + auto stats_before = cache.getStats(); + ASSERT_GT(stats_before.total_bytes, settings.ttl_cache_max_size * 0.90); + ASSERT_EQ(stats_before.async_eviction_triggered, 0); + + // Add one more segment - should trigger async eviction + { + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + String seg = fmt::format("test-uuid-0000-0000-0000-00000000000e/{}/col.bin/offset_trigger", part); + + String data = String(segment_size, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // Check that async eviction was triggered + auto stats_after = cache.getStats(); + ASSERT_EQ(stats_after.async_eviction_triggered, 1); + + // Wait for async eviction to complete + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + // Verify some space was freed + auto stats_final = cache.getStats(); + ASSERT_GT(stats_final.evicted_size_limit, 0); + + // Try adding another segment immediately - should be rate limited + { + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + String seg = fmt::format("test-uuid-0000-0000-0000-00000000000e/{}/col.bin/offset_rate_limit", part); + + String data = String(segment_size, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // Should be rate limited (still 1 trigger, but skipped counter increased) + auto stats_rate_limit = cache.getStats(); + ASSERT_EQ(stats_rate_limit.async_eviction_triggered, 1); + ASSERT_GT(stats_rate_limit.async_eviction_skipped_rate_limit, 0); +} + +// Test explicit min/max time parameters override partition_id parsing +TEST_F(DiskCacheTTLTest, ExplicitTimestamps) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + UInt64 ttl_minutes = 60; // 1 
hour TTL + DiskCacheTTL cache("test_ttl", "test-uuid-0000-0000-0000-000000000010", volume, nullptr, settings, strategy, ttl_minutes, 0); + + time_t now = time(nullptr); + struct tm tm_now; + gmtime_r(&now, &tm_now); + + // Use old partition_id (2 hours ago) that would be rejected by partition parsing + time_t old_time = now - (2 * 60 * 60); + struct tm tm_old; + gmtime_r(&old_time, &tm_old); + String old_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_old.tm_year + 1900, tm_old.tm_mon + 1, tm_old.tm_mday); + + // Test 1: Without explicit timestamps - should be rejected (partition is old) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000010/{}/col.bin/offset_0", old_part); + String data = "test1"; + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + + auto [disk, path] = cache.get(seg); + ASSERT_TRUE(path.empty()); // Should NOT be cached (old partition) + } + + // Test 2: With explicit max_time (recent) - should be cached despite old partition_id + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000010/{}/col.bin/offset_1", old_part); + String data = "test2"; + ReadBufferFromString buf(data); + + time_t recent_max_time = now - (30 * 60); // 30 minutes ago (within TTL) + cache.set(seg, buf, data.size(), false, 0, recent_max_time); + + auto [disk, path] = cache.get(seg); + ASSERT_FALSE(path.empty()); // Should BE cached (explicit max_time is recent) + } + + // Test 3: With explicit max_time (old) - should be rejected + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000010/{}/col.bin/offset_2", old_part); + String data = "test3"; + ReadBufferFromString buf(data); + + time_t old_max_time = now - (90 * 60); // 90 minutes ago (outside TTL) + cache.set(seg, buf, data.size(), false, 0, old_max_time); + + auto [disk, path] = cache.get(seg); + ASSERT_TRUE(path.empty()); // Should NOT be cached (explicit max_time is old) + } +} + +// Test preload vs query stats tracking +TEST_F(DiskCacheTTLTest, 
PreloadQueryStats) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + UInt64 ttl_minutes = 60; // 1 hour TTL + DiskCacheTTL cache("test_ttl", "test-uuid-0000-0000-0000-000000000011", volume, nullptr, settings, strategy, ttl_minutes, 0); + + time_t now = time(nullptr); + struct tm tm_now; + gmtime_r(&now, &tm_now); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + + // Cache with preload=false (query-triggered) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000011/{}/col.bin/offset_0", part); + String data = String(100, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); // is_preload=false + + auto stats = cache.getStats(); + ASSERT_EQ(stats.cached_from_query, 1); + ASSERT_EQ(stats.cached_bytes_query, 100); + ASSERT_EQ(stats.cached_from_preload, 0); + ASSERT_EQ(stats.cached_bytes_preload, 0); + } + + // Cache with preload=true (background preload) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000011/{}/col.bin/offset_1", part); + String data = String(200, 'b'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), true); // is_preload=true + + auto stats = cache.getStats(); + ASSERT_EQ(stats.cached_from_query, 1); + ASSERT_EQ(stats.cached_bytes_query, 100); + ASSERT_EQ(stats.cached_from_preload, 1); + ASSERT_EQ(stats.cached_bytes_preload, 200); + } + + // Cache more query-triggered segments + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000011/{}/col.bin/offset_2", part); + String data = String(50, 'c'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); // is_preload=false + + auto stats = cache.getStats(); + ASSERT_EQ(stats.cached_from_query, 2); + ASSERT_EQ(stats.cached_bytes_query, 150); + ASSERT_EQ(stats.cached_from_preload, 1); + 
ASSERT_EQ(stats.cached_bytes_preload, 200); + } +} + +// Test unlimited per-table cache (constrained only by global limit) +TEST_F(DiskCacheTTLTest, UnlimitedPerTable) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 0; // No worker-level per-table default + auto strategy = std::make_shared(settings); + + UInt64 ttl_minutes = 60; + + // Pass max_size_bytes=0 → unlimited per-table (constrained by global) + DiskCacheTTL cache("test_unlimited", "test-uuid-0000-0000-0000-000000000012", volume, nullptr, settings, strategy, ttl_minutes, 0); + + time_t now = time(nullptr); + struct tm tm_now; + gmtime_r(&now, &tm_now); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + + // Cache some data - no per-table limit check + for (int i = 0; i < 5; i++) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000012/{}/col.bin/offset_{}", part, i); + String data = String(1024, 'x'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // Verify cached (no per-table eviction triggered) + auto stats = cache.getStats(); + ASSERT_EQ(stats.async_eviction_triggered, 0); // No local eviction + ASSERT_EQ(stats.total_entries, 5); +} + +// Test 2-tier precedence: per-table max_size_bytes > worker ttl_cache_max_size > unlimited (0) +TEST_F(DiskCacheTTLTest, SizeLimitPrecedence) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 10 * 1024 * 1024; // 10MB worker-level limit + settings.ttl_cache_max_percent = 80; // Would be larger than 10MB + auto strategy = std::make_shared(settings); + + UInt64 ttl_minutes = 60; + + // Test 1: Per-table limit (1MB) overrides worker-level (10MB) + { + DiskCacheTTL cache("test_per_table", "test-uuid-0000-0000-0000-000000000013", + volume, nullptr, settings, strategy, ttl_minutes, 1024 * 1024); + + time_t now = time(nullptr); + struct tm tm_now; + 
gmtime_r(&now, &tm_now); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + + // Fill to 95% of 1MB (should trigger at 90%) + size_t segment_size = 100 * 1024; // 100KB per segment + for (int i = 0; i < 10; i++) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000013/{}/col.bin/offset_{}", part, i); + String data = String(segment_size, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // Should trigger eviction at 1MB limit, not 10MB + auto stats = cache.getStats(); + ASSERT_GT(stats.async_eviction_triggered, 0); + } + + // Test 2: Worker-level limit (10MB) used when per-table = 0 + { + DiskCacheTTL cache("test_worker_level", "test-uuid-0000-0000-0000-000000000014", + volume, nullptr, settings, strategy, ttl_minutes, 0); + + time_t now = time(nullptr); + struct tm tm_now; + gmtime_r(&now, &tm_now); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + + // Fill to 95% of 10MB + size_t segment_size = 1024 * 1024; // 1MB per segment + for (int i = 0; i < 10; i++) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000014/{}/col.bin/offset_{}", part, i); + String data = String(segment_size, 'b'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // Should trigger eviction at 10MB limit + auto stats = cache.getStats(); + ASSERT_GT(stats.async_eviction_triggered, 0); + } + + // Test 3: Unlimited when both per-table and worker-level = 0 + { + DiskCacheSettings settings_no_limit; + settings_no_limit.ttl_cache_max_size = 0; + auto strategy_no_limit = std::make_shared(settings_no_limit); + + DiskCacheTTL cache("test_unlimited", "test-uuid-0000-0000-0000-000000000015", + volume, nullptr, settings_no_limit, strategy_no_limit, ttl_minutes, 0); + + time_t now = time(nullptr); + struct tm tm_now; + gmtime_r(&now, &tm_now); + String part = 
fmt::format("{:04d}{:02d}{:02d}_1_100_2", + tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + + // Cache data - no per-table limit + for (int i = 0; i < 5; i++) + { + String seg = fmt::format("test-uuid-0000-0000-0000-000000000015/{}/col.bin/offset_{}", part, i); + String data = String(1024, 'c'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + // No per-table eviction (unlimited, only constrained by global) + auto stats = cache.getStats(); + ASSERT_EQ(stats.async_eviction_triggered, 0); + ASSERT_EQ(stats.total_entries, 5); + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static String makeSegKey(const String & uuid, const String & part, const String & col, const String & ext) +{ + return fmt::format("{}/{}/{}#0{}", uuid, part, col, ext); +} + +static String todayPart() +{ + time_t now = time(nullptr); + struct tm t; + gmtime_r(&now, &t); + return fmt::format("{:04d}{:02d}{:02d}_1_100_2", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday); +} + +static String expiredPart() +{ + time_t ts = time(nullptr) - 2 * 24 * 3600; + struct tm t; + gmtime_r(&ts, &t); + return fmt::format("{:04d}{:02d}{:02d}_1_100_2", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday); +} + +// --------------------------------------------------------------------------- +// In-memory IMetaStore mock +// --------------------------------------------------------------------------- + +class MockMetaStore : public Catalog::IMetaStore +{ +public: + struct MockIterator : public Iterator + { + std::vector> entries; + int pos = -1; + bool next() override { return ++pos < static_cast(entries.size()); } + String key() override { return entries[pos].first; } + String value() override { return entries[pos].second; } + }; + + void put(const String & key, const String & value, bool = false) override { store[key] = value; } + std::pair 
putCAS(const String &, const String &, const String &, bool) override { return {false, {}}; } + uint64_t get(const String & key, String & value) override + { + auto it = store.find(key); + if (it == store.end()) return 0; + value = it->second; + return 1; + } + std::vector> multiGet(const std::vector &) override { return {}; } + bool batchWrite(const Catalog::BatchCommitRequest &, Catalog::BatchCommitResponse &) override { return true; } + void drop(const String & key, const UInt64 &) override { store.erase(key); } + void drop(const String & key, const String &) override { store.erase(key); } + IteratorPtr getAll() override { return getByPrefix(""); } + IteratorPtr getByPrefix(const String & prefix, const size_t & limit = 0, uint32_t = 0, const String & start_key = "") override + { + auto iter = std::make_shared(); + for (auto & [k, v] : store) + { + if (!k.starts_with(prefix)) + continue; + // start_key is inclusive (FIRST_GREATER_OR_EQUAL semantics for first batch) + if (!start_key.empty() && k < start_key) + continue; + iter->entries.emplace_back(k, v); + if (limit > 0 && iter->entries.size() >= limit) + break; + } + return iter; + } + IteratorPtr getByRange(const String &, const String &, bool, bool) override { return std::make_shared(); } + void clean(const String & prefix) override + { + for (auto it = store.begin(); it != store.end(); ) + it = it->first.starts_with(prefix) ? 
store.erase(it) : std::next(it); + } + void close() override {} + uint32_t getMaxBatchSize() override { return 1000; } + uint32_t getMaxKVSize() override { return 1024 * 1024; } + + std::map store; +}; + +// --------------------------------------------------------------------------- +// Parameterized: set / get / evict for .bin, .mrk, .idx +// --------------------------------------------------------------------------- + +struct SegCase { const char * ext; const char * expected_prefix; }; + +class SegmentPrefixTest : public DiskCacheTTLTest, + public ::testing::WithParamInterface {}; + +INSTANTIATE_TEST_SUITE_P(AllTypes, SegmentPrefixTest, ::testing::Values( + SegCase{".bin", "data/"}, + SegCase{".mrk", "meta/"}, + SegCase{".idx", "meta/"} +)); + +TEST_P(SegmentPrefixTest, SetGoesToCorrectDir) +{ + auto p = GetParam(); + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 64 * 1024 * 1024; + auto strategy = std::make_shared(settings); + DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, 60, 0); + + String seg = makeSegKey("aaaa-bbbb", todayPart(), "col", p.ext); + String data = "payload"; + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + + auto [disk, path] = cache.get(seg); + ASSERT_FALSE(path.empty()) << "segment not found after set: " << seg; + EXPECT_NE(path.find(p.expected_prefix), String::npos) + << p.ext << " should be under " << p.expected_prefix << " but path=" << path; + // Also verify the file actually exists at the returned path + ASSERT_TRUE(disk); + EXPECT_TRUE(disk->exists(path)) << "file missing on disk at: " << path; +} + +TEST_P(SegmentPrefixTest, GetReturnsExistingFile) +{ + auto p = GetParam(); + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 64 * 1024 * 1024; + auto strategy = std::make_shared(settings); + DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, 60, 
0); + + String seg = makeSegKey("aaaa-bbbb", todayPart(), "col", p.ext); + String data = "payload"; + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + + // get() must return a path that actually contains the prefix and the file + auto [disk, path] = cache.get(seg); + ASSERT_TRUE(disk) << "no disk for " << seg; + EXPECT_TRUE(disk->exists(path)) << "file not on disk: " << path; + EXPECT_NE(path.find(p.expected_prefix), String::npos) + << p.ext << " get() returned wrong prefix: " << path; +} + +TEST_P(SegmentPrefixTest, EvictRemovesFromDisk) +{ + auto p = GetParam(); + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 64 * 1024 * 1024; + auto strategy = std::make_shared(settings); + // 1-minute TTL — 2-day-old part is expired + DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, 1, 0); + + time_t old_ts = time(nullptr) - 2 * 24 * 3600; + String seg = makeSegKey("aaaa-bbbb", expiredPart(), "col", p.ext); + String data = "payload"; + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false, 0, old_ts); + + ASSERT_EQ(cache.getKeyCount(), 1); + auto [disk, path] = cache.get(seg); + ASSERT_TRUE(disk && disk->exists(path)) << "file should exist before eviction: " << path; + + cache.evictExpired(); + + EXPECT_EQ(cache.getKeyCount(), 0); + EXPECT_FALSE(disk->exists(path)) + << p.ext << " file still on disk after eviction — rel_path prefix bug? 
path=" << path; +} + +// --------------------------------------------------------------------------- +// Reconcile: FDB entries for all three types restore with correct rel_path +// --------------------------------------------------------------------------- + +TEST_F(DiskCacheTTLTest, ReconcileRestoresAllTypesWithCorrectRelPath) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 64 * 1024 * 1024; + auto strategy = std::make_shared(settings); + DiskCacheTTL cache("test-cache", "test-uuid", volume, nullptr, settings, strategy, 60, 0); + + const String uuid = "aaaa-bbbb-cccc-dddd"; + const String part = todayPart(); + const time_t now = time(nullptr); + + struct SegInfo { String ext; String expected_prefix; }; + SegInfo cases[] = {{".bin", "data/"}, {".mrk", "meta/"}, {".idx", "meta/"}}; + + // Write all three types to disk so reconcile can verify file existence + std::map seg_to_path; + for (auto & c : cases) + { + String seg = makeSegKey(uuid, part, "col", c.ext); + String data = "payload"; + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false, 0, now); + auto [disk, path] = cache.get(seg); + ASSERT_FALSE(path.empty()) << "failed to cache: " << seg; + EXPECT_NE(path.find(c.expected_prefix), String::npos) + << "wrong write prefix for " << c.ext << ": " << path; + seg_to_path[seg] = path; + } + + // Build mock FDB store — key_prefix = "{ns}_DCI_{worker}_{uuid}" + // Seed one entry per segment using encodeValue; the key just needs the prefix. 
+ const String ns = "byconity"; + const String worker = "test-worker"; + const String key_prefix = fmt::format("{}_DCI_{}_{}", ns, worker, uuid); + auto mock_store = std::make_shared(); + int i = 0; + for (auto & [seg, path] : seg_to_path) + { + String fdb_key = fmt::format("{}_{:04d}", key_prefix, i++); + mock_store->store[fdb_key] = fmt::format("{}:{}:{}", static_cast(now), 7, seg); + } + + // Reconcile into a fresh cache_map + TTLCacheFDBIndex fdb_idx(mock_store, ns, worker, uuid, worker); + std::map> cache_map; + auto get_rel_path = [&cache](UInt128 key, const String & seg_name) -> std::filesystem::path + { + return cache.getRelativePath(key, seg_name); + }; + + fdb_idx.reconcile( + volume, + get_rel_path, + [](time_t) { return true; }, + [&cache_map](TTLCacheFDBIndex::ReconcileBatch & batch) { + for (auto & [key, meta] : batch) + cache_map[key] = meta; + } + ); + + ASSERT_EQ(cache_map.size(), 3u) << "expected 3 entries restored"; + + for (auto & [seg, expected_path] : seg_to_path) + { + auto key = DiskCacheTTL::hash(seg); + auto it = cache_map.find(key); + ASSERT_NE(it, cache_map.end()) << "segment not restored: " << seg; + EXPECT_EQ(it->second->rel_path, expected_path) + << "rel_path mismatch for " << seg + << "\n got: " << it->second->rel_path + << "\n want: " << expected_path; + } +} + +// Verify drop() decrements partition_stats correctly +TEST_F(DiskCacheTTLTest, DropUpdatesPartitionStats) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + DiskCacheTTL cache("test_drop_pstats", "test-uuid-drop", volume, nullptr, settings, strategy, 60 * 24 * 365, 0); + + time_t now = time(nullptr); + struct tm tm; + gmtime_r(&now, &tm); + String part1 = fmt::format("{:04d}{:02d}{:02d}_1_100_2", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + String part2 = fmt::format("{:04d}{:02d}{:02d}_2_200_2", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + String 
partition_id = fmt::format("{:04d}{:02d}{:02d}", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + const String uuid = "test-uuid-drop"; + + for (int i = 0; i < 3; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, part1, i); + String data(100, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + for (int i = 0; i < 2; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, part2, i); + String data(100, 'b'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + ASSERT_EQ(cache.getKeyCount(), 5); + + { + auto pstats = cache.getPartitionStats(); + bool found = false; + for (const auto & ps : pstats) + { + if (ps.partition_id == partition_id) + { + found = true; + ASSERT_EQ(ps.entry_count, 5u); + ASSERT_EQ(ps.total_bytes, 500u); + } + } + ASSERT_TRUE(found) << "partition not found before drop: " << partition_id; + } + + cache.drop(uuid + "/" + part1); + + ASSERT_EQ(cache.getKeyCount(), 2); + ASSERT_EQ(cache.getCachedSize(), 200u); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 2u); + ASSERT_EQ(gstats.total_bytes, 200u); + } + + { + auto pstats = cache.getPartitionStats(); + bool found = false; + for (const auto & ps : pstats) + { + if (ps.partition_id == partition_id) + { + found = true; + ASSERT_EQ(ps.entry_count, 2u); + ASSERT_EQ(ps.total_bytes, 200u); + } + } + ASSERT_TRUE(found) << "partition not found after drop: " << partition_id; + } +} + +// Verify evictExpired() is a no-op on fresh entries and leaves partition_stats intact +TEST_F(DiskCacheTTLTest, EvictExpiredNoOpKeepsPartitionStats) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + UInt64 ttl_minutes = 60; + DiskCacheTTL cache("test_evict_noop", "test-uuid-evict", volume, nullptr, settings, strategy, ttl_minutes, 0); + + time_t now = time(nullptr); + struct tm tm; + 
gmtime_r(&now, &tm); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + String partition_id = fmt::format("{:04d}{:02d}{:02d}", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + const String uuid = "test-uuid-evict"; + + for (int i = 0; i < 4; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, part, i); + String data(100, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + ASSERT_EQ(cache.getKeyCount(), 4); + + cache.evictExpired(); // nothing should be evicted — entries are within TTL + + ASSERT_EQ(cache.getKeyCount(), 4); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 4u); + ASSERT_EQ(gstats.total_bytes, 400u); + ASSERT_EQ(gstats.evicted_expired, 0u); + } + + auto pstats = cache.getPartitionStats(); + bool found = false; + for (const auto & ps : pstats) + { + if (ps.partition_id == partition_id) + { + found = true; + ASSERT_EQ(ps.entry_count, 4u); + ASSERT_EQ(ps.total_bytes, 400u); + } + } + ASSERT_TRUE(found) << "partition disappeared after no-op evictExpired: " << partition_id; +} + +// Verify evictOldestPartitionsUntilSpace() decrements partition_stats for evicted partition +TEST_F(DiskCacheTTLTest, SizeLimitEvictionUpdatesPartitionStats) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + // ttl_minutes=0: no TTL rejection so we can use different-day partitions freely + DiskCacheTTL cache("test_size_pstats", "test-uuid-size", volume, nullptr, settings, strategy, 0, 100 * 1024 * 1024); + + time_t now = time(nullptr); + time_t yesterday = now - 25 * 3600; // definitely the previous calendar day + struct tm tm_now, tm_yest; + gmtime_r(&now, &tm_now); + gmtime_r(&yesterday, &tm_yest); + + String today_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + String 
yest_part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", tm_yest.tm_year + 1900, tm_yest.tm_mon + 1, tm_yest.tm_mday); + + String today_pid = fmt::format("{:04d}{:02d}{:02d}", tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + String yest_pid = fmt::format("{:04d}{:02d}{:02d}", tm_yest.tm_year + 1900, tm_yest.tm_mon + 1, tm_yest.tm_mday); + + if (today_pid == yest_pid) + GTEST_SKIP() << "test requires two distinct calendar days (running at midnight boundary)"; + + const String uuid = "test-uuid-size"; + const size_t seg_size = 1024; + + for (int i = 0; i < 4; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, yest_part, i); + String data(seg_size, 'y'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + for (int i = 0; i < 3; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, today_part, i); + String data(seg_size, 't'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + + ASSERT_EQ(cache.getKeyCount(), 7); + + { + auto pstats = cache.getPartitionStats(); + bool fy = false, ft = false; + for (const auto & ps : pstats) + { + if (ps.partition_id == yest_pid) { fy = true; ASSERT_EQ(ps.entry_count, 4u); } + if (ps.partition_id == today_pid) { ft = true; ASSERT_EQ(ps.entry_count, 3u); } + } + ASSERT_TRUE(fy) << "yesterday partition missing: " << yest_pid; + ASSERT_TRUE(ft) << "today partition missing: " << today_pid; + } + + // Free exactly 4 * seg_size bytes → should evict yesterday's 4 segments + cache.evictOldestPartitionsUntilSpace(4 * seg_size); + + ASSERT_EQ(cache.getKeyCount(), 3); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 3u); + ASSERT_EQ(gstats.total_bytes, 3 * seg_size); + ASSERT_EQ(gstats.evicted_size_limit, 4u); + ASSERT_EQ(gstats.evicted_expired, 0u); // TTL eviction was NOT used + } + + { + auto pstats = cache.getPartitionStats(); + for (const auto & ps : pstats) + { + if (ps.partition_id == yest_pid) + { + 
ASSERT_EQ(ps.entry_count, 0u) << "yesterday partition should be empty after eviction"; + ASSERT_EQ(ps.total_bytes, 0u); + } + if (ps.partition_id == today_pid) + { + ASSERT_EQ(ps.entry_count, 3u) << "today partition should be untouched"; + } + } + } +} + +// Verify part_index is correctly rebuilt after drop + re-add; also tests that +// cache_stats and partition_stats stay consistent across the full cycle. +TEST_F(DiskCacheTTLTest, PartIndexRebuildAfterDrop) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + DiskCacheTTL cache("test_part_idx", "test-uuid-idx", volume, nullptr, settings, strategy, 60 * 24 * 365, 0); + + const String uuid = "test-uuid-idx"; + time_t now = time(nullptr); + struct tm tm; + gmtime_r(&now, &tm); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + String partition_id = fmt::format("{:04d}{:02d}{:02d}", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday); + const size_t seg_bytes = 64; + + // Phase 1: add 3 segments for 'part' + for (int i = 0; i < 3; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, part, i); + String data(seg_bytes, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + ASSERT_EQ(cache.getKeyCount(), 3u); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 3u); + ASSERT_EQ(gstats.total_bytes, 3 * seg_bytes); + } + + // Phase 2: drop clears part_index entry for 'part' + cache.drop(uuid + "/" + part); + ASSERT_EQ(cache.getKeyCount(), 0u); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 0u); + ASSERT_EQ(gstats.total_bytes, 0u); + } + { + auto pstats = cache.getPartitionStats(); + for (const auto & ps : pstats) + if (ps.partition_id == partition_id) + { + ASSERT_EQ(ps.entry_count, 0u); + ASSERT_EQ(ps.total_bytes, 0u); + } + } + + // Phase 3: re-add 2 segments — 
part_index must be re-populated from scratch + for (int i = 0; i < 2; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, part, i); + String data(seg_bytes, 'b'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false); + } + ASSERT_EQ(cache.getKeyCount(), 2u); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 2u); + ASSERT_EQ(gstats.total_bytes, 2 * seg_bytes); + } + + // All 2 segments must be retrievable + for (int i = 0; i < 2; i++) + { + String seg = fmt::format("{}/{}/col.bin/offset_{}", uuid, part, i); + auto [disk, path] = cache.get(seg); + ASSERT_FALSE(path.empty()) << "segment " << i << " not found after re-add"; + } + + // Phase 4: second drop — part_index entry removed again, stats zeroed + cache.drop(uuid + "/" + part); + ASSERT_EQ(cache.getKeyCount(), 0u); + { + auto gstats = cache.getStats(); + ASSERT_EQ(gstats.total_entries, 0u); + ASSERT_EQ(gstats.total_bytes, 0u); + } +} + +// --------------------------------------------------------------------------- +// drop() must evict FDB forward + reverse entries for the dropped part +// --------------------------------------------------------------------------- + +TEST_F(DiskCacheTTLTest, DropEvictsFDBEntries) +{ + auto volume = createTestVolume(); + DiskCacheSettings settings; + settings.ttl_cache_max_size = 1024 * 1024; + auto strategy = std::make_shared(settings); + + // No underscores in these strings so Catalog::escapeString is a no-op + const String uuid = "test-uuid-fdb"; + const String ns = "byconity"; + const String worker = "test-worker"; + const String key_prefix = ns + "_DCI_" + worker + "_" + uuid; + const String rev_key_prefix = ns + "_DCIREV_" + uuid; + + auto mock_store = std::make_shared(); + auto fdb_idx = std::make_shared(mock_store, ns, worker, uuid, worker); + + DiskCacheTTL cache("test_fdb_drop", uuid, volume, nullptr, settings, strategy, 60 * 24 * 365, 0); + cache.setFDBIndex(fdb_idx); + + time_t now = time(nullptr); + struct 
tm tm_now; + gmtime_r(&now, &tm_now); + String part = fmt::format("{:04d}{:02d}{:02d}_1_100_2", tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + String partition_id = fmt::format("{:04d}{:02d}{:02d}", tm_now.tm_year + 1900, tm_now.tm_mon + 1, tm_now.tm_mday); + + const size_t seg_bytes = 64; + const int num_segs = 3; + + for (int i = 0; i < num_segs; i++) + { + String seg = makeSegKey(uuid, part, fmt::format("col{}", i), ".bin"); + String data(seg_bytes, 'a'); + ReadBufferFromString buf(data); + cache.set(seg, buf, data.size(), false, 0, now); + + // Seed FDB store manually (batchWrite in MockMetaStore is a no-op). + // hexKey layout: first 16 chars = hex(items[1]=low), last 16 = hex(items[0]=high). + auto key = DiskCacheTTL::hash(seg); + auto hex = DiskCacheTTL::hexKey(key); + String high_hex = hex.substr(16, 16); // items[0] = sipHash64(part_name) + String low_hex = hex.substr(0, 16); // items[1] = sipHash64(column) + mock_store->store[fmt::format("{}_{}_{}_{}", key_prefix, partition_id, high_hex, low_hex)] + = fmt::format("{}:{}:{}", static_cast(now), seg_bytes, seg); + mock_store->store[fmt::format("{}_{}_{}_{}", rev_key_prefix, partition_id, high_hex, low_hex)] + = worker; + } + + ASSERT_EQ(cache.getKeyCount(), static_cast(num_segs)); + ASSERT_EQ(mock_store->store.size(), static_cast(num_segs * 2)); // fwd + rev per segment + + cache.drop(uuid + "/" + part); + ASSERT_EQ(cache.getKeyCount(), 0u); + + // Flush pending evictPart ops: detach fdb_idx from cache then destroy it. + // The destructor sets stopped=true, drains the queue, and joins the bg thread. 
+ cache.setFDBIndex(nullptr); + fdb_idx.reset(); + + EXPECT_TRUE(mock_store->store.empty()) + << "FDB entries not cleaned after drop(); remaining=" << mock_store->store.size(); +} + +} // namespace DB diff --git a/src/Storages/DiskCache/tests/gtest_ttl_cache_fdb_index_test.cpp b/src/Storages/DiskCache/tests/gtest_ttl_cache_fdb_index_test.cpp new file mode 100644 index 00000000000..52e24afb131 --- /dev/null +++ b/src/Storages/DiskCache/tests/gtest_ttl_cache_fdb_index_test.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static String fdbMakeSegKey(const String & uuid, const String & part, const String & col, const String & ext) +{ + return fmt::format("{}/{}/{}#0{}", uuid, part, col, ext); +} + +static String fdbTodayPart() +{ + time_t now = time(nullptr); + struct tm t; + gmtime_r(&now, &t); + return fmt::format("{:04d}{:02d}{:02d}_1_100_2", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday); +} + +// --------------------------------------------------------------------------- +// Mock metastore — respects limit and start_key for pagination testing +// --------------------------------------------------------------------------- + +class FDBMockMetaStore : public Catalog::IMetaStore +{ +public: + struct MockIterator : public Iterator + { + std::vector> entries; + int pos = -1; + bool next() override { return ++pos < static_cast(entries.size()); } + String key() override { return entries[pos].first; } + String value() override { return entries[pos].second; } + }; + + void put(const String & key, const String & value, bool = false) override { store[key] = value; } + std::pair putCAS(const String &, 
const String &, const String &, bool) override { return {false, {}}; } + uint64_t get(const String & key, String & value) override + { + auto it = store.find(key); + if (it == store.end()) return 0; + value = it->second; + return 1; + } + std::vector> multiGet(const std::vector &) override { return {}; } + bool batchWrite(const Catalog::BatchCommitRequest & req, Catalog::BatchCommitResponse &) override + { + for (auto & d : req.deletes) + store.erase(d.key); + return true; + } + void drop(const String & key, const UInt64 &) override { store.erase(key); } + void drop(const String & key, const String &) override { store.erase(key); } + IteratorPtr getAll() override { return getByPrefix(""); } + IteratorPtr getByPrefix(const String & prefix, const size_t & limit = 0, uint32_t = 0, const String & start_key = "") override + { + auto iter = std::make_shared(); + for (auto & [k, v] : store) + { + if (!k.starts_with(prefix)) + continue; + if (!start_key.empty() && k < start_key) + continue; + iter->entries.emplace_back(k, v); + if (limit > 0 && iter->entries.size() >= limit) + break; + } + return iter; + } + IteratorPtr getByRange(const String &, const String &, bool, bool) override { return std::make_shared(); } + void clean(const String & prefix) override + { + for (auto it = store.begin(); it != store.end(); ) + it = it->first.starts_with(prefix) ? 
store.erase(it) : std::next(it); + } + void close() override {} + uint32_t getMaxBatchSize() override { return 1000; } + uint32_t getMaxKVSize() override { return 1024 * 1024; } + + std::map store; +}; + +// --------------------------------------------------------------------------- +// Test fixture +// --------------------------------------------------------------------------- + +class TTLCacheFDBIndexTest : public ::testing::Test +{ +public: + static void SetUpTestCase() + { + Poco::AutoPtr formatter(new Poco::PatternFormatter("%Y.%m.%d %H:%M:%S.%F <%p> %s: %t")); + Poco::AutoPtr console_channel(new Poco::ConsoleChannel); + Poco::AutoPtr channel(new Poco::FormattingChannel(formatter, console_channel)); + Poco::Logger::root().setLevel("warning"); + Poco::Logger::root().setChannel(channel); + ctx = getContext().context; + } + + static void TearDownTestCase() { ctx->shutdown(); } + + void SetUp() override + { + fs::remove_all("tmp_fdb/"); + fs::create_directories("tmp_fdb/ttl_disk/"); + UnitTest::initLogger(); + DB::IDiskCache::init(*getContext().context); + } + + void TearDown() override + { + fs::remove_all("tmp_fdb/"); + DB::IDiskCache::close(); + } + + VolumePtr createVolume() + { + auto disk = std::make_shared("fdb_ttl_disk", "tmp_fdb/ttl_disk/", DiskStats{}); + return std::make_shared("fdb_ttl_volume", std::move(disk), 0); + } + + DiskCacheSettings makeSettings(size_t max_bytes = 64 * 1024 * 1024) { + DiskCacheSettings s; + s.ttl_cache_max_size = max_bytes; + return s; + } + + static std::shared_ptr ctx; +}; + +std::shared_ptr TTLCacheFDBIndexTest::ctx = nullptr; + +// --------------------------------------------------------------------------- +// Helpers to seed the mock store with valid encoded FDB entries +// --------------------------------------------------------------------------- + +static void seedFDBEntry(FDBMockMetaStore & store, const String & key_prefix, + const String & fdb_key_suffix, const String & seg, size_t size, time_t ts) +{ + 
store.store[key_prefix + fdb_key_suffix] = fmt::format("{}:{}:{}", static_cast(ts), size, seg); +} + +// --------------------------------------------------------------------------- +// Test: all entries restored, on_reconcile_batch called +// --------------------------------------------------------------------------- + +TEST_F(TTLCacheFDBIndexTest, RestoresAllEntries) +{ + auto volume = createVolume(); + auto settings = makeSettings(); + auto strategy = std::make_shared(settings); + + const String uuid = "restore-uuid"; + const String ns = "ns", worker = "w1"; + const String kp = fmt::format("{}_DCI_{}_{}", ns, worker, uuid); + const time_t now = time(nullptr); + + auto mock = std::make_shared(); + const int N = 5; + std::vector segs; + for (int i = 0; i < N; ++i) + { + String seg = fdbMakeSegKey(uuid, fdbTodayPart(), fmt::format("col{}", i), ".bin"); + segs.push_back(seg); + seedFDBEntry(*mock, kp, fmt::format("_k{:04d}", i), seg, 64, now); + } + + TTLCacheFDBIndex idx(mock, ns, worker, uuid, worker); + DiskCacheTTL cache("rc", uuid, volume, nullptr, settings, strategy, 60 * 24, 0); + + size_t batch_calls = 0; + std::map> restored; + auto result = idx.reconcile( + volume, + [&](UInt128 key, const String & seg) { return cache.getRelativePath(key, seg); }, + [&](time_t ts) { return ts > now - 3600; }, + [&](TTLCacheFDBIndex::ReconcileBatch & batch) { + batch_calls++; + for (auto & [k, m] : batch) restored[k] = m; + } + ); + + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(result->first, static_cast(N)); + EXPECT_EQ(restored.size(), static_cast(N)); + EXPECT_GE(batch_calls, 1u); + + for (auto & seg : segs) + EXPECT_NE(restored.find(DiskCacheTTL::hash(seg)), restored.end()) << "missing: " << seg; +} + +// --------------------------------------------------------------------------- +// Test: expired entries skipped and cleaned from FDB per page +// --------------------------------------------------------------------------- + +TEST_F(TTLCacheFDBIndexTest, 
StaleEntriesCleanedFromFDB) +{ + auto volume = createVolume(); + auto settings = makeSettings(); + auto strategy = std::make_shared(settings); + + const String uuid = "stale-uuid"; + const String ns = "ns", worker = "w1"; + const String kp = fmt::format("{}_DCI_{}_{}", ns, worker, uuid); + const time_t now = time(nullptr); + const time_t old_ts = now - 7 * 24 * 3600; // 7 days ago + + auto mock = std::make_shared(); + + // 3 fresh entries + for (int i = 0; i < 3; ++i) + seedFDBEntry(*mock, kp, fmt::format("_fresh_{:04d}", i), + fdbMakeSegKey(uuid, fdbTodayPart(), fmt::format("c{}", i), ".bin"), 64, now); + + // 2 expired entries + for (int i = 0; i < 2; ++i) + seedFDBEntry(*mock, kp, fmt::format("_stale_{:04d}", i), + fdbMakeSegKey(uuid, fdbTodayPart(), fmt::format("s{}", i), ".bin"), 64, old_ts); + + ASSERT_EQ(mock->store.size(), 5u); + + TTLCacheFDBIndex idx(mock, ns, worker, uuid, worker); + DiskCacheTTL cache("stale", uuid, volume, nullptr, settings, strategy, 60 * 24, 0); + + std::map> restored; + auto result = idx.reconcile( + volume, + [&](UInt128 key, const String & seg) { return cache.getRelativePath(key, seg); }, + [&](time_t ts) { return ts > now - 3600; }, // only very recent + [&](TTLCacheFDBIndex::ReconcileBatch & batch) { + for (auto & [k, m] : batch) restored[k] = m; + } + ); + + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(result->first, 3u); + EXPECT_EQ(restored.size(), 3u); + + // Stale entries must have been deleted from the mock store + for (auto & [k, v] : mock->store) + EXPECT_EQ(k.find("_stale_"), String::npos) << "stale key not cleaned: " << k; +} + +// --------------------------------------------------------------------------- +// Test: pagination — entries spanning multiple pages all restored, no duplicates +// --------------------------------------------------------------------------- + +TEST_F(TTLCacheFDBIndexTest, PaginationRestoresAllEntries) +{ + auto volume = createVolume(); + auto settings = makeSettings(256 * 1024 * 1024); + auto 
strategy = std::make_shared(settings); + + const String uuid = "page-uuid"; + const String ns = "ns", worker = "w1"; + const String kp = fmt::format("{}_DCI_{}_{}", ns, worker, uuid); + const time_t now = time(nullptr); + + auto mock = std::make_shared(); + + // Seed PAGE_SIZE + 3 entries to force at least 2 pages (PAGE_SIZE = 100000). + // MockMetaStore respects limit + start_key, so pagination is exercised end-to-end. + const size_t PAGE_SIZE = 100'000; + const size_t TOTAL = PAGE_SIZE + 3; + for (size_t i = 0; i < TOTAL; ++i) + { + // Use zero-padded keys so std::map ordering matches FDB lexicographic ordering. + String seg = fdbMakeSegKey(uuid, fdbTodayPart(), fmt::format("col{:07d}", i), ".bin"); + seedFDBEntry(*mock, kp, fmt::format("_{:07d}", i), seg, 32, now); + } + ASSERT_EQ(mock->store.size(), TOTAL); + + TTLCacheFDBIndex idx(mock, ns, worker, uuid, worker); + DiskCacheTTL cache("page", uuid, volume, nullptr, settings, strategy, 60 * 24, 0); + + size_t batch_calls = 0; + std::map> restored; + auto result = idx.reconcile( + volume, + [&](UInt128 key, const String & seg) { return cache.getRelativePath(key, seg); }, + [&](time_t ts) { return ts > now - 3600; }, + [&](TTLCacheFDBIndex::ReconcileBatch & batch) { + batch_calls++; + for (auto & [k, m] : batch) restored[k] = m; + } + ); + + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(result->first, TOTAL) << "got: " << result->first << ", want: " << TOTAL; + // No duplicates + EXPECT_EQ(restored.size(), TOTAL) << "duplicates detected: map size " << restored.size() << " vs total " << TOTAL; + // At least 2 batch calls (one per page) + EXPECT_GE(batch_calls, 2u) << "expected pagination but only got " << batch_calls << " batch call(s)"; +} + +// --------------------------------------------------------------------------- +// Test: empty FDB returns nullopt +// --------------------------------------------------------------------------- + +TEST_F(TTLCacheFDBIndexTest, EmptyFDBReturnsNullopt) +{ + auto volume = 
createVolume(); + auto settings = makeSettings(); + auto strategy = std::make_shared(settings); + + const String uuid = "empty-uuid"; + auto mock = std::make_shared(); + + TTLCacheFDBIndex idx(mock, "ns", "w1", uuid, "w1"); + DiskCacheTTL cache("empty", uuid, volume, nullptr, settings, strategy, 60, 0); + + bool batch_called = false; + auto result = idx.reconcile( + volume, + [&](UInt128 key, const String & seg) { return cache.getRelativePath(key, seg); }, + [](time_t) { return true; }, + [&](TTLCacheFDBIndex::ReconcileBatch &) { batch_called = true; } + ); + + EXPECT_FALSE(result.has_value()); + EXPECT_FALSE(batch_called); +} + +} // namespace DB diff --git a/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp b/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp index f05a21e9543..d1f3763cd10 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCNCH.cpp @@ -355,7 +355,7 @@ void MergeTreeDataPartCNCH::loadFromFileSystem(bool load_hint_mutation) try { MetaInfoDiskCacheSegment metainfo_segment(shared_from_this()); - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(); + auto disk_cache = storage.getDiskCache()->getMetaCache(); auto [cache_disk, segment_path] = disk_cache->get(metainfo_segment.getSegmentName()); if (cache_disk && cache_disk->exists(segment_path)) { @@ -389,7 +389,7 @@ void MergeTreeDataPartCNCH::loadFromFileSystem(bool load_hint_mutation) if (parent_part && enableDiskCache()) { auto segment = std::make_shared(shared_from_this()); - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(); + auto disk_cache = storage.getDiskCache()->getMetaCache(); disk_cache->cacheSegmentsToLocalDisk({std::move(segment)}); } } @@ -753,7 +753,7 @@ IMergeTreeDataPart::IndexPtr MergeTreeDataPartCNCH::loadIndexFromStorage() const /// first try to load index from local disk cache if (enableDiskCache()) { - auto disk_cache = 
DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(); + auto disk_cache = storage.getDiskCache()->getMetaCache(); PrimaryIndexDiskCacheSegment segment(shared_from_this()); auto [cache_disk, segment_path] = disk_cache->get(segment.getSegmentName()); @@ -794,7 +794,7 @@ IMergeTreeDataPart::IndexPtr MergeTreeDataPartCNCH::loadIndexFromStorage() const if (enableDiskCache()) { auto index_seg = std::make_shared(shared_from_this()); - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(); + auto disk_cache = storage.getDiskCache()->getMetaCache(); disk_cache->cacheSegmentsToLocalDisk({std::move(index_seg)}); } return res; @@ -812,7 +812,7 @@ IMergeTreeDataPart::ChecksumsPtr MergeTreeDataPartCNCH::loadChecksums([[maybe_un if (enableDiskCache()) { ChecksumsDiskCacheSegment checksums_segment(shared_from_this()); - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(); + auto disk_cache = storage.getDiskCache()->getMetaCache(); auto [cache_disk, segment_path] = disk_cache->get(checksums_segment.getSegmentName()); if (cache_disk && cache_disk->exists(segment_path)) @@ -907,7 +907,7 @@ IMergeTreeDataPart::ChecksumsPtr MergeTreeDataPartCNCH::loadChecksumsFromRemote( if (enableDiskCache() && follow_part_chain) { auto segment = std::make_shared(shared_from_this()); - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(); + auto disk_cache = storage.getDiskCache()->getMetaCache(); disk_cache->cacheSegmentsToLocalDisk({std::move(segment)}); } @@ -1237,8 +1237,9 @@ void MergeTreeDataPartCNCH::preload(UInt64 preload_level, UInt64 submit_ts) cons return; } - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree); + auto disk_cache = storage.getDiskCache(); auto cache_strategy = disk_cache->getStrategy(); + IDiskCache * mark_disk_cache = disk_cache->getMetaCache().get(); MarkRanges all_mark_ranges{MarkRange(0, getMarksCount())}; 
MarkCachePtr mark_cache_holder = storage.getContext()->getMarkCache(); @@ -1269,7 +1270,7 @@ void MergeTreeDataPartCNCH::preload(UInt64 preload_level, UInt64 submit_ts) cons PartFileDiskCacheSegment::FileOffsetAndSize{getFileOffsetOrZero(mark_file_name), getFileSizeOrZero(mark_file_name)}, getMarksCount(), mark_cache_holder.get(), - disk_cache->getMetaCache().get(), + mark_disk_cache, stream_name, DATA_FILE_EXTENSION, PartFileDiskCacheSegment::FileOffsetAndSize{getFileOffsetOrZero(data_file_name), getFileSizeOrZero(data_file_name)}, @@ -1370,13 +1371,33 @@ void MergeTreeDataPartCNCH::preload(UInt64 preload_level, UInt64 submit_ts) cons off_t mark_file_offset = source_data_part->getFileOffsetOrZero(mark_file_name); size_t mark_file_size = source_data_part->getFileSizeOrZero(mark_file_name); + if (mark_file_size == 0) + { + LOG_DEBUG( + storage.log, + "Skipping preload of index {} for part {}: not in checksums (written before index was added)", + index_name, + getFullRelativePath()); + continue; + } + + // Skip indexes have GRANULARITY N: one index mark per N primary-key marks. + // Their marks_count = ceil(data_marks / N), not getMarksCount() (which is data marks). + // Derive from the actual mark file size to avoid a size mismatch in MergeTreeMarksLoader. + size_t mark_size = source_data_part->index_granularity_info.getMarkSizeInBytes(1); + size_t skip_index_marks_count = mark_size > 0 ? 
mark_file_size / mark_size : 0; + if (skip_index_marks_count == 0) + continue; + + MarkRanges index_mark_ranges{MarkRange(0, skip_index_marks_count)}; + IDiskCacheSegmentsVector segs = cache_strategy->transferRangesToSegments( - all_mark_ranges, + index_mark_ranges, source_data_part, PartFileDiskCacheSegment::FileOffsetAndSize{mark_file_offset, mark_file_size}, - getMarksCount(), + skip_index_marks_count, mark_cache_holder.get(), - disk_cache->getMetaCache().get(), + mark_disk_cache, index_name, INDEX_FILE_EXTENSION, PartFileDiskCacheSegment::FileOffsetAndSize{data_file_offset, data_file_size}, @@ -1494,7 +1515,7 @@ void MergeTreeDataPartCNCH::preload(UInt64 preload_level, UInt64 submit_ts) cons std::unique_ptr part_helper = std::make_unique( getMvccDataPart(index_helper->getFileName() + INDEX_FILE_EXTENSION), - DiskCacheFactory::instance().get(DiskCacheType::MergeTree)->getMetaCache(), + storage.getDiskCache()->getMetaCache(), DiskCacheMode::USE_DISK_CACHE); factory->get(index_helper->getFileName(), std::move(part_helper)); } @@ -1548,7 +1569,7 @@ void MergeTreeDataPartCNCH::dropDiskCache(ThreadPool & pool, bool drop_vw_disk_c } auto part_log = storage.getContext()->getPartLog(storage.getDatabaseName()); - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree); + auto disk_cache = storage.getDiskCache(); auto cache_strategy = disk_cache->getStrategy(); auto impl = [part_log, part = shared_from_this(), part_base_path, disk_cache] { diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index aa361706b3e..4ccc9edc381 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1978,13 +1978,13 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( range.begin / index_granularity, (range.end + index_granularity - 1) / index_granularity); + total_granules += range.end - range.begin; + 
index_time_watcher.watch(IndexTimeWatcher::Type::SEEK, [&](){ if (last_index_mark != index_range.begin || !granule) reader.seek(index_range.begin); }); - total_granules += index_range.end - index_range.begin; - for (size_t index_mark = index_range.begin; index_mark < index_range.end; ++index_mark) { index_time_watcher.watch(IndexTimeWatcher::Type::READ, [&](){ @@ -2014,7 +2014,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( if (!maybe_true) { - ++granules_dropped; + granules_dropped += data_range.end - data_range.begin; continue; } diff --git a/src/Storages/MergeTree/MergeTreeIndexReader.cpp b/src/Storages/MergeTree/MergeTreeIndexReader.cpp index 7270a2fc4bb..b8af3eb3f22 100644 --- a/src/Storages/MergeTree/MergeTreeIndexReader.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexReader.cpp @@ -20,7 +20,6 @@ */ #include -#include #include #include #include @@ -84,7 +83,7 @@ MergeTreeIndexReader::MergeTreeIndexReader( MergeTreeDataPartPtr source_data_part = part_->getMvccDataPart(index_name + INDEX_FILE_EXTENSION); if (source_data_part->enableDiskCache()) { - auto disk_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree); + auto disk_cache = source_data_part->storage.getDiskCache(); segment_cache_strategy = disk_cache->getStrategy(); segment_cache = disk_cache; diff --git a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp index a7523e092e6..f7482fc9494 100644 --- a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp +++ b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp @@ -171,7 +171,7 @@ MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl() if (local_cache_disk && local_cache_disk->exists(local_cache_path) && settings.read_settings.disk_cache_mode != DiskCacheMode::FORCE_STEAL_DISK_CACHE) { from_disk_cache = true; - LOG_TRACE(&Poco::Logger::get(__func__), "load from local disk cache {}, mrk_path {}", local_cache_disk->getPath(), local_cache_path); + LOG_TRACE(&Poco::Logger::get(__func__), 
"marks cache hit: seg_key={} disk={} path={}", mrk_seg_key, local_cache_disk->getPath(), local_cache_path); size_t cached_mark_file_size = local_cache_disk->getFileSize(local_cache_path); if (expected_file_size != cached_mark_file_size) throw Exception( @@ -230,7 +230,9 @@ MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl() } } - LOG_TRACE(&Poco::Logger::get(__func__), "load from remote filesystem mrk_path {}", mrk_path); + LOG_TRACE(&Poco::Logger::get(__func__), "marks cache miss: seg_key={} falling back to remote fs mrk_path={}", + IDiskCacheSegment::formatSegmentName(UUIDHelpers::UUIDToString(storage_uuid), part_name, stream_name, 0, index_granularity_info.marks_file_extension), + mrk_path); auto buf = disk->readFile(mrk_path, load_mark_read_settings); if (buf->seek(mark_file_offset) != mark_file_offset) throw Exception("Cannot seek to mark file " + mrk_path + " for stream " + stream_name, ErrorCodes::CANNOT_SEEK_THROUGH_FILE); diff --git a/src/Storages/MergeTree/MergeTreeReaderCNCH.cpp b/src/Storages/MergeTree/MergeTreeReaderCNCH.cpp index 7c0eb1f6ff5..c4bf3f31415 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCNCH.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCNCH.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -96,7 +96,7 @@ MergeTreeReaderCNCH::MergeTreeReaderCNCH( { if (data_part->enableDiskCache()) { - segment_cache = DiskCacheFactory::instance().get(DiskCacheType::MergeTree); + segment_cache = data_part->storage.getDiskCache(); segment_cache_strategy = segment_cache->getStrategy(); } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 76b98f6e3b7..cda4aa14dab 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -505,6 +505,8 @@ enum StealingCacheMode : UInt64 M(Bool, enable_parts_sync_preload, 0, "Enable sync 
preload parts", 0) \ M(Bool, enable_gc_evict_disk_cache, false, "Enable gc evict disk cache", 0) \ M(UInt64, disk_cache_stealing_mode, 0, "Read/write remote vw local disk cache if cur local disk cache empty, 0: close; 1: read 2: write 3: read&write", 0) \ + M(UInt64, disk_cache_ttl_hours, 0, "Per-table TTL cache: cache parts with max_timestamp within this age. 0 = disabled (use global LRU). >0 = enable per-table TTL cache.", 0) \ + M(UInt64, disk_cache_max_size_bytes, 0, "Per-table cache size limit in bytes. 0 = unlimited (constrained by global limit). Only applies when disk_cache_ttl_hours > 0.", 0) \ \ /* Renamed settings - cannot be ignored */\ M(Bool, enable_nullable_sorting_key, false, "Alias of `allow_nullable_key`", 0) \ diff --git a/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.cpp b/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.cpp index 65c3b571fdc..316e0817619 100644 --- a/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.cpp +++ b/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.cpp @@ -13,12 +13,16 @@ * limitations under the License. 
*/ +#include #include #include #include #include #include +#include +#include #include +#include #include #include #include "Compression/CachedCompressedReadBuffer.h" @@ -149,11 +153,55 @@ MergedReadBufferWithSegmentCache::MergedReadBufferWithSegmentCache( total_segment_count(total_segment_count_), marks_loader(marks_loader_), current_segment_idx(0), current_compressed_offset(std::nullopt), part_host(part_host_), stream_extension(stream_extension_), - logger(&Poco::Logger::get("MergedReadBufferWithSegmentCache")) + is_idx(stream_extension_ == INDEX_FILE_EXTENSION), + logger(&Poco::Logger::get("MergedReadBufferWithSegmentCache")), + cached_query_id(CurrentThread::getQueryId().toString()) { + bool is_ttl_cache = dynamic_cast(segment_cache_) != nullptr; + if (is_ttl_cache) + { + if (auto ctx = CurrentThread::get().getQueryContext()) + collect_cache_stats = ctx->getSettingsRef().report_segment_profiles + || ctx->getSettingsRef().log_segment_profiles; + } + LOG_DEBUG(logger, "MergedReadBufferWithSegmentCache: part={} stream={} query_id={} is_ttl={} collect_stats={}", + part_name_, stream_name_, cached_query_id, is_ttl_cache, collect_cache_stats); initialize(); } +MergedReadBufferWithSegmentCache::~MergedReadBufferWithSegmentCache() +{ + try { flushLocalCacheStats(); } + catch (...) 
{ tryLogCurrentException(logger, "flushLocalCacheStats in destructor"); } +} + +void MergedReadBufferWithSegmentCache::flushLocalCacheStats() +{ + if (!collect_cache_stats || local_cache_stats.empty()) + return; + // Close out any open segment timer + if (active_segment_start_ms > 0) + { + uint64_t elapsed = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count()) - active_segment_start_ms; + if (is_idx) + { + if (active_is_cache) local_cache_stats.idx_cache_read_ms += elapsed; + else local_cache_stats.idx_s3_read_ms += elapsed; + } + else + { + if (active_is_cache) local_cache_stats.cache_read_ms += elapsed; + else local_cache_stats.s3_read_ms += elapsed; + } + active_segment_start_ms = 0; + } + if (!cached_query_id.empty()) + DiskCacheFactory::instance().mergeQueryCacheStats(cached_query_id, local_cache_stats); + local_cache_stats = {}; +} + void MergedReadBufferWithSegmentCache::initialize() { if (settings.read_settings.remote_read_log) settings.read_settings.remote_read_context = stream_name + stream_extension; @@ -190,6 +238,11 @@ bool MergedReadBufferWithSegmentCache::nextImpl() ProfileEvents::increment(ProfileEvents::CnchReadSizeFromDiskCache, buf_size); + if (collect_cache_stats) + { + if (is_idx) local_cache_stats.idx_cache_bytes += buf_size; + else local_cache_stats.cache_bytes += buf_size; + } if (progress_callback) progress_callback({0, 0, 0, 0, buf_size}); @@ -199,6 +252,32 @@ bool MergedReadBufferWithSegmentCache::nextImpl() current_compressed_offset = marks_loader.getMark(current_segment_idx * cache_segment_size).offset_in_compressed_file + cache_buffer.compressedOffset(); + // Segment boundary: stop timer, then flush to DiskCacheFactory immediately. + // Flushing per-boundary ensures stats are available even for LIMIT queries + // that cancel before reaching EOF. 
+ if (collect_cache_stats && active_segment_start_ms > 0) + { + uint64_t elapsed = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count()) - active_segment_start_ms; + if (is_idx) + { + if (active_is_cache) local_cache_stats.idx_cache_read_ms += elapsed; + else local_cache_stats.idx_s3_read_ms += elapsed; + } + else + { + if (active_is_cache) local_cache_stats.cache_read_ms += elapsed; + else local_cache_stats.s3_read_ms += elapsed; + } + active_segment_start_ms = 0; + } + if (collect_cache_stats && !local_cache_stats.empty()) + { + DiskCacheFactory::instance().mergeQueryCacheStats(cached_query_id, local_cache_stats); + local_cache_stats = {}; + } + cache_buffer.reset(); LOG_TRACE(logger, fmt::format("Cache buffer of segment {} encounter " @@ -245,6 +324,19 @@ bool MergedReadBufferWithSegmentCache::nextImpl() ProfileEvents::CnchReadSizeFromDiskCache : ProfileEvents::CnchReadSizeFromRemote, buf_size); + if (collect_cache_stats) + { + if (is_idx) + { + if (cache_buffer.initialized()) local_cache_stats.idx_cache_bytes += buf_size; + else local_cache_stats.idx_s3_bytes += buf_size; + } + else + { + if (cache_buffer.initialized()) local_cache_stats.cache_bytes += buf_size; + else local_cache_stats.s3_bytes += buf_size; + } + } if (cache_buffer.initialized() && progress_callback) progress_callback({0, 0, 0, 0, buf_size}); @@ -323,6 +415,16 @@ void MergedReadBufferWithSegmentCache::seekToPosition(size_t segment_idx, } // No segment cache, trying to use source reader + if (collect_cache_stats) + { + // For data: count s3_fallback_segs here (complements cache_miss_segs from seekToMarkInSegmentCache). + // For idx: miss already counted in seekToMarkInSegmentCache; skip here to avoid double-count. 
+ if (!is_idx) ++local_cache_stats.s3_fallback_segs; + active_segment_start_ms = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count()); + active_is_cache = false; + } initSourceBufferIfNeeded(); LOG_TRACE(logger, fmt::format("Seek to remote file {} in part {}, offset {}:{}, base offset {}, limit {}", @@ -370,11 +472,38 @@ bool MergedReadBufferWithSegmentCache::seekToMarkInSegmentCache(size_t segment_i if (settings.read_settings.disk_cache_mode == DiskCacheMode::FORCE_DISK_CACHE) throw Exception(ErrorCodes::DISK_CACHE_NOT_USED, "Can't find disk cache {} but enable `FORCE_DISK_CACHE`", segment_key); - if ((settings.remote_disk_cache_stealing == StealingCacheMode::READ_WRITE - || settings.remote_disk_cache_stealing == StealingCacheMode::READ_ONLY) - && parsed_assign_compute_host.has_value() && parsed_disk_cache_host.has_value() - && removeBracketsIfIpv6(parsed_assign_compute_host.value()) != removeBracketsIfIpv6(parsed_disk_cache_host.value())) - return seekToMarkInRemoteSegmentCache(segment_idx, mark_pos, segment_key); + if (settings.remote_disk_cache_stealing == StealingCacheMode::READ_WRITE + || settings.remote_disk_cache_stealing == StealingCacheMode::READ_ONLY) + { + // FDB-backed peer lookup: topology-aware, fires on scale events and post-restart gaps. + // Takes precedence over routing-hint stealing for TTL caches. 
+ auto * ttl_cache = dynamic_cast(segment_cache); + if (ttl_cache) + { + if (auto peer_endpoint = ttl_cache->findPeerOwner(segment_key)) + { + bool stolen = seekToMarkInRemoteSegmentCache(segment_idx, mark_pos, segment_key, *peer_endpoint); + if (collect_cache_stats) + { + if (stolen) ++local_cache_stats.steal_segs; + else if (is_idx) ++local_cache_stats.idx_miss_segs; + else ++local_cache_stats.cache_miss_segs; + } + return stolen; + } + } + else if (parsed_assign_compute_host.has_value() && parsed_disk_cache_host.has_value() + && removeBracketsIfIpv6(parsed_assign_compute_host.value()) != removeBracketsIfIpv6(parsed_disk_cache_host.value())) + { + // Legacy routing-hint stealing for non-TTL caches. + return seekToMarkInRemoteSegmentCache(segment_idx, mark_pos, segment_key, {}); + } + } + if (collect_cache_stats) + { + if (is_idx) ++local_cache_stats.idx_miss_segs; + else ++local_cache_stats.cache_miss_segs; + } LOG_TRACE( logger, "Can't find disk cache key {} and fallback to read from remote fs. (current buffer at {}), segment {}, offset {}:{}", @@ -398,6 +527,15 @@ bool MergedReadBufferWithSegmentCache::seekToMarkInSegmentCache(size_t segment_i cache_buffer.seek(mark_pos.offset_in_compressed_file - segment_start_compressed_offset, mark_pos.offset_in_decompressed_block); current_segment_idx = segment_idx; + if (collect_cache_stats) + { + if (is_idx) ++local_cache_stats.idx_hit_segs; + else ++local_cache_stats.cache_hit_segs; + active_segment_start_ms = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count()); + active_is_cache = true; + } } catch(...) 
{ @@ -410,11 +548,15 @@ bool MergedReadBufferWithSegmentCache::seekToMarkInSegmentCache(size_t segment_i } bool MergedReadBufferWithSegmentCache::seekToMarkInRemoteSegmentCache(size_t segment_idx, - const MarkInCompressedFile& mark_pos, const String & segment_key) + const MarkInCompressedFile& mark_pos, const String & segment_key, const String & endpoint) { if (!segment_cache) return false; + const String & peer = endpoint.empty() ? part_host.disk_cache_host_port : endpoint; + if (peer.empty()) + return false; + DistributedDataClientOption option{ .max_request_rate = segment_cache->getSettings().stealing_max_request_rate, .connection_timeout_ms = segment_cache->getSettings().stealing_connection_timeout_ms, @@ -423,7 +565,7 @@ bool MergedReadBufferWithSegmentCache::seekToMarkInRemoteSegmentCache(size_t seg .retry_sleep_ms = segment_cache->getSettings().stealing_retry_sleep_ms, .max_queue_count = segment_cache->getSettings().stealing_max_queue_count, }; - auto remote_data_client = std::make_shared(part_host.disk_cache_host_port, segment_key, option); + auto remote_data_client = std::make_shared(peer, segment_key, option); auto remote_cache_file = std::make_unique(remote_data_client, settings.read_settings.remote_fs_buffer_size); if (remote_cache_file->getFileName().empty()) return false; @@ -435,7 +577,7 @@ bool MergedReadBufferWithSegmentCache::seekToMarkInRemoteSegmentCache(size_t seg logger, fmt::format( "Seek to remote diskcache {}:{} (current buffer at {}), segment {}, offset {}:{}", - part_host.disk_cache_host_port, + peer, remote_cache_file->getFileName(), cache_buffer.initialized() ? 
cache_buffer.path() : "Uninitialized", segment_idx, diff --git a/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.h b/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.h index 62e7e032d57..be0ca89d314 100644 --- a/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.h +++ b/src/Storages/MergeTree/MergedReadBufferWithSegmentCache.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,8 @@ class MergedReadBufferWithSegmentCache: public ReadBuffer clockid_t clock_type_ = CLOCK_MONOTONIC_COARSE, String stream_extension_ = DATA_FILE_EXTENSION); + ~MergedReadBufferWithSegmentCache() override; + virtual size_t readBig(char* to, size_t n) override; virtual bool nextImpl() override; @@ -124,7 +127,8 @@ class MergedReadBufferWithSegmentCache: public ReadBuffer void seekToPosition(size_t segment_idx, const MarkInCompressedFile& mark_pos); bool seekToMarkInSegmentCache(size_t segment_idx, const MarkInCompressedFile& mark_pos); void initialize(); - bool seekToMarkInRemoteSegmentCache(size_t segment_idx, const MarkInCompressedFile& mark_pos, const String & segment_key); + // endpoint: FDB-found peer address; empty = fall back to part_host.disk_cache_host_port + bool seekToMarkInRemoteSegmentCache(size_t segment_idx, const MarkInCompressedFile& mark_pos, const String & segment_key, const String & endpoint = {}); void initCacheBufferIfNeeded(const DiskPtr & disk, const String & path, std::unique_ptr remote_cache = nullptr); void initSourceBufferIfNeeded(); @@ -169,10 +173,21 @@ class MergedReadBufferWithSegmentCache: public ReadBuffer PartHostInfo part_host; String stream_extension; + bool is_idx{false}; // true when stream_extension == ".idx" (skip-index segment) Poco::Logger* logger; off_t read_until_position = 0; + + // Per-stream cache stats flushed to DiskCacheFactory registry at segment boundaries and in destructor. + // Only populated when segment_cache is a DiskCacheTTL instance AND query requested segment profiles. 
+ bool collect_cache_stats{false}; + String cached_query_id; + QueryCacheStatsSnapshot local_cache_stats; + uint64_t active_segment_start_ms{0}; // wall-clock ms when current segment read started + bool active_is_cache{false}; // true = cache_buffer active, false = source_buffer + + void flushLocalCacheStats(); }; } diff --git a/src/Storages/PartCacheManager.cpp b/src/Storages/PartCacheManager.cpp index f2d3bde7044..11bad433730 100644 --- a/src/Storages/PartCacheManager.cpp +++ b/src/Storages/PartCacheManager.cpp @@ -1909,6 +1909,9 @@ PartCacheManager::getLastModificationTimeHints(const ConstStoragePtr & storage, } const auto * meta_storage = dynamic_cast(storage.get()); + if (!meta_storage) + throw Exception("Table is not a Meta Based MergeTree", ErrorCodes::UNKNOWN_TABLE); + auto meta_partitions = table_meta->getPartitionList(); // Skip if it passes TTL @@ -1922,8 +1925,6 @@ PartCacheManager::getLastModificationTimeHints(const ConstStoragePtr & storage, continue; Protos::LastModificationTimeHint hint = Protos::LastModificationTimeHint{}; - if (!meta_storage) - throw Exception("Table is not a Meta Based MergeTree", ErrorCodes::UNKNOWN_TABLE); String partition = partition_info->getPartitionValue(*meta_storage); diff --git a/src/Storages/StorageCloudMergeTree.cpp b/src/Storages/StorageCloudMergeTree.cpp index 1b9a67846d8..00a06cc4c3b 100644 --- a/src/Storages/StorageCloudMergeTree.cpp +++ b/src/Storages/StorageCloudMergeTree.cpp @@ -15,6 +15,7 @@ #include +#include #include #include "Core/UUID.h" #include "Storages/IStorage.h" @@ -43,6 +44,7 @@ #include #include #include +#include namespace ProfileEvents { @@ -103,6 +105,32 @@ void StorageCloudMergeTree::shutdown() dedup_worker->stop(); } +IDiskCachePtr StorageCloudMergeTree::getDiskCache() const +{ + // getDiskCache() is called per-part; compute the pointer once per storage lifetime. 
+ std::call_once(disk_cache_flag, [this] + { + if (getSettings()->disk_cache_ttl_hours.value > 0) + { + disk_cache_ptr = DiskCacheFactory::instance().createDiskCacheFromTableSettings( + getStorageID().getFullNameNotQuoted(), + getStorageUUID(), + *getContext(), + getContext()->getDiskCacheThrottler(), + getSettings()->disk_cache_ttl_hours.value * 60, + getSettings()->disk_cache_max_size_bytes.value + ); + } + else + { + // TTL disabled — evict any stale registry entry so re-enabling picks up fresh settings. + DiskCacheFactory::instance().removeTableTTLCache(getStorageUUID()); + disk_cache_ptr = DiskCacheFactory::instance().get(DiskCacheType::MergeTree); + } + }); + return disk_cache_ptr; +} + StorageCloudMergeTree::~StorageCloudMergeTree() { } diff --git a/src/Storages/StorageCloudMergeTree.h b/src/Storages/StorageCloudMergeTree.h index 95e6cbc72c3..715bdcc3f14 100644 --- a/src/Storages/StorageCloudMergeTree.h +++ b/src/Storages/StorageCloudMergeTree.h @@ -26,6 +26,10 @@ namespace DB class CloudMergeTreeDedupWorker; using CloudMergeTreeDedupWorkerPtr = std::unique_ptr; + +class IDiskCache; +using IDiskCachePtr = std::shared_ptr; + namespace IngestColumnCnch { struct IngestPartitionParam; @@ -105,6 +109,8 @@ class StorageCloudMergeTree : public shared_ptr_helper, p CloudMergeTreeDedupWorker * tryGetDedupWorker() { return dedup_worker.get(); } CloudMergeTreeDedupWorker * getDedupWorker(); + IDiskCachePtr getDiskCache() const override; + QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; bool getQueryProcessingStageWithAggregateProjection(ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const; @@ -136,6 +142,10 @@ class StorageCloudMergeTree : public shared_ptr_helper, p std::set required_bucket_numbers; CloudMergeTreeDedupWorkerPtr dedup_worker; + + // Cached per-query disk cache pointer — getDiskCache() is called 
per-part so compute once. + mutable std::once_flag disk_cache_flag; + mutable IDiskCachePtr disk_cache_ptr; }; } diff --git a/src/Storages/StorageCnchMergeTree.cpp b/src/Storages/StorageCnchMergeTree.cpp index 67d48a49c66..86cccc8e428 100644 --- a/src/Storages/StorageCnchMergeTree.cpp +++ b/src/Storages/StorageCnchMergeTree.cpp @@ -96,6 +96,8 @@ #include #include #include +#include +#include namespace ProfileEvents @@ -259,13 +261,6 @@ QueryProcessingStage::Enum StorageCnchMergeTree::getQueryProcessingStage( } } -void StorageCnchMergeTree::startup() -{ -} - -void StorageCnchMergeTree::shutdown() -{ -} Pipe StorageCnchMergeTree::read( const Names & column_names, diff --git a/src/Storages/StorageCnchMergeTree.h b/src/Storages/StorageCnchMergeTree.h index 31d8bd4ee55..cf6d39ff0a7 100644 --- a/src/Storages/StorageCnchMergeTree.h +++ b/src/Storages/StorageCnchMergeTree.h @@ -31,6 +31,8 @@ namespace DB struct PrepareContextResult; class ASTSystemQuery; +class IDiskCache; +using IDiskCachePtr = std::shared_ptr; class StorageCnchMergeTree final : public shared_ptr_helper, public MergeTreeMetaBase, public CnchStorageCommonHelper { @@ -69,8 +71,7 @@ class StorageCnchMergeTree final : public shared_ptr_helper getDefaultSettings() const override; + +private: }; using StorageCnchMergeTreePtr = std::shared_ptr; diff --git a/src/Storages/System/StorageSystemDiskTTLCachePartitions.cpp b/src/Storages/System/StorageSystemDiskTTLCachePartitions.cpp new file mode 100644 index 00000000000..d4b70784870 --- /dev/null +++ b/src/Storages/System/StorageSystemDiskTTLCachePartitions.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +NamesAndTypesList StorageSystemDiskTTLCachePartitions::getNamesAndTypes() +{ + return { + {"worker_id", std::make_shared()}, + {"table_name", std::make_shared()}, + {"table_uuid", std::make_shared()}, + {"partition", std::make_shared()}, + 
{"entry_count", std::make_shared()}, + {"bytes", std::make_shared()}, + }; +} + +StorageSystemDiskTTLCachePartitions::StorageSystemDiskTTLCachePartitions(const StorageID & table_id_) + : IStorageSystemOneBlock(table_id_) +{ +} + +static void fillPartitionRow(MutableColumns & res_columns, const String & worker_id, const Protos::TTLCachePartitionStats & p) +{ + size_t col_idx = 0; + res_columns[col_idx++]->insert(worker_id); + res_columns[col_idx++]->insert(p.table_name()); + res_columns[col_idx++]->insert(p.table_uuid()); + res_columns[col_idx++]->insert(p.partition()); + res_columns[col_idx++]->insert(p.entry_count()); + res_columns[col_idx++]->insert(p.bytes()); +} + +void StorageSystemDiskTTLCachePartitions::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const +{ + if (context->getServerType() == ServerType::cnch_server) + { + auto * log = &Poco::Logger::get("StorageSystemDiskTTLCachePartitions"); + std::vector all_workers; + try + { + auto rm_client = context->getResourceManagerClient(); + if (!rm_client) + { + LOG_WARNING(log, "ResourceManager client unavailable, returning empty result"); + return; + } + rm_client->getAllWorkers(all_workers); + } + catch (...) + { + tryLogCurrentException(log, "Failed to get workers from ResourceManager"); + return; + } + + LOG_INFO(log, "Querying TTL partition stats from {} worker(s)", all_workers.size()); + auto & pools = context->getCnchWorkerClientPools(); + for (const auto & wd : all_workers) + { + if (wd.vw_name == ResourceManagement::toSystemVWName(ResourceManagement::VirtualWarehouseType::Write)) + continue; + try + { + auto worker = pools.getWorker(wd.host_ports); + auto partitions = worker->getTTLCachePartitionStats(); + for (const auto & p : partitions) + fillPartitionRow(res_columns, wd.id.empty() ? wd.host_ports.getRPCAddress() : wd.id, p); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + return; + } + + // On worker: read directly from local DiskCacheFactory registry + String worker_id = getWorkerID(context); + auto ttl_caches = DiskCacheFactory::instance().getAllTableTTLCaches(); + for (const auto & [uuid, cache_ptr] : ttl_caches) + { + auto * ttl_cache = dynamic_cast(cache_ptr.get()); + if (!ttl_cache) + continue; + + auto table_stats = ttl_cache->getStats(); + for (const auto & ps : ttl_cache->getPartitionStats()) + { + Protos::TTLCachePartitionStats p; + p.set_table_name(ttl_cache->getName()); + p.set_table_uuid(table_stats.table_uuid); + p.set_partition(ps.partition_id); + p.set_entry_count(ps.entry_count); + p.set_bytes(ps.total_bytes); + fillPartitionRow(res_columns, worker_id, p); + } + } +} + +} diff --git a/src/Storages/System/StorageSystemDiskTTLCachePartitions.h b/src/Storages/System/StorageSystemDiskTTLCachePartitions.h new file mode 100644 index 00000000000..4b1bd3005b3 --- /dev/null +++ b/src/Storages/System/StorageSystemDiskTTLCachePartitions.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class Context; + +/** Implements system table disk_ttl_cache_partitions + * Shows per-partition TTL disk cache statistics (hits, misses, size) + */ +class StorageSystemDiskTTLCachePartitions final : public shared_ptr_helper, + public IStorageSystemOneBlock +{ + friend struct shared_ptr_helper; +public: + std::string getName() const override { return "SystemDiskTTLCachePartitions"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: + StorageSystemDiskTTLCachePartitions(const StorageID & table_id_); + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/StorageSystemDiskTTLCachePreloads.cpp b/src/Storages/System/StorageSystemDiskTTLCachePreloads.cpp new file mode 100644 index 00000000000..b2408e2e224 --- /dev/null +++ 
b/src/Storages/System/StorageSystemDiskTTLCachePreloads.cpp @@ -0,0 +1,106 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +NamesAndTypesList StorageSystemDiskTTLCachePreloads::getNamesAndTypes() +{ + return { + {"worker_id", std::make_shared()}, + {"table_name", std::make_shared()}, + {"table_uuid", std::make_shared()}, + {"partition_id", std::make_shared()}, + {"parts_in_flight", std::make_shared()}, + {"parts_submitted", std::make_shared()}, + {"elapsed_ms", std::make_shared()}, + {"preload_level", std::make_shared()}, + }; +} + +StorageSystemDiskTTLCachePreloads::StorageSystemDiskTTLCachePreloads(const StorageID & table_id_) + : IStorageSystemOneBlock(table_id_) +{ +} + +static void fillPreloadRow(MutableColumns & res_columns, const String & worker_id, const Protos::PreloadPartitionStats & p) +{ + size_t col_idx = 0; + res_columns[col_idx++]->insert(worker_id); + res_columns[col_idx++]->insert(p.table_name()); + res_columns[col_idx++]->insert(p.table_uuid()); + res_columns[col_idx++]->insert(p.partition_id()); + res_columns[col_idx++]->insert(p.parts_in_flight()); + res_columns[col_idx++]->insert(p.parts_submitted()); + res_columns[col_idx++]->insert(p.elapsed_ms()); + res_columns[col_idx++]->insert(p.preload_level()); +} + +void StorageSystemDiskTTLCachePreloads::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const +{ + if (context->getServerType() == ServerType::cnch_server) + { + auto * log = &Poco::Logger::get("StorageSystemDiskTTLCachePreloads"); + std::vector all_workers; + try + { + auto rm_client = context->getResourceManagerClient(); + if (!rm_client) + { + LOG_WARNING(log, "ResourceManager client unavailable, returning empty result"); + return; + } + rm_client->getAllWorkers(all_workers); + } + catch (...) 
+ { + tryLogCurrentException(log, "Failed to get workers from ResourceManager"); + return; + } + + auto & pools = context->getCnchWorkerClientPools(); + for (const auto & wd : all_workers) + { + if (wd.vw_name == ResourceManagement::toSystemVWName(ResourceManagement::VirtualWarehouseType::Write)) + continue; + try + { + auto worker = pools.getWorker(wd.host_ports); + auto partitions = worker->getPreloadStats(); + for (const auto & p : partitions) + fillPreloadRow(res_columns, wd.id.empty() ? wd.host_ports.getRPCAddress() : wd.id, p); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + return; + } + + // On worker: read directly from PreloadRegistry + String worker_id = getWorkerID(context); + for (const auto & snap : PreloadRegistry::instance().getSnapshot()) + { + Protos::PreloadPartitionStats p; + p.set_table_name(snap.table_name); + p.set_table_uuid(snap.table_uuid); + p.set_partition_id(snap.partition_id); + p.set_parts_in_flight(snap.parts_in_flight); + p.set_parts_submitted(snap.parts_submitted); + p.set_elapsed_ms(snap.elapsed_ms); + p.set_preload_level(snap.preload_level); + fillPreloadRow(res_columns, worker_id, p); + } +} + +} diff --git a/src/Storages/System/StorageSystemDiskTTLCachePreloads.h b/src/Storages/System/StorageSystemDiskTTLCachePreloads.h new file mode 100644 index 00000000000..b6264b61b09 --- /dev/null +++ b/src/Storages/System/StorageSystemDiskTTLCachePreloads.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class Context; + +/** Implements system table disk_ttl_cache_preloads + * Shows in-flight async preload tasks grouped by (worker, table, partition) + */ +class StorageSystemDiskTTLCachePreloads final : public shared_ptr_helper, + public IStorageSystemOneBlock +{ + friend struct shared_ptr_helper; +public: + std::string getName() const override { return "SystemDiskTTLCachePreloads"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: + 
StorageSystemDiskTTLCachePreloads(const StorageID & table_id_); + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/StorageSystemDiskTTLCacheTables.cpp b/src/Storages/System/StorageSystemDiskTTLCacheTables.cpp new file mode 100644 index 00000000000..667ea90d99b --- /dev/null +++ b/src/Storages/System/StorageSystemDiskTTLCacheTables.cpp @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +NamesAndTypesList StorageSystemDiskTTLCacheTables::getNamesAndTypes() +{ + return { + {"worker_id", std::make_shared()}, + {"table_name", std::make_shared()}, + {"table_uuid", std::make_shared()}, + {"ttl_minutes", std::make_shared()}, + {"max_size_bytes", std::make_shared()}, + {"last_eviction_run", std::make_shared()}, + {"eviction_stats", std::make_shared(std::make_shared(), std::make_shared())}, + {"rejection_stats", std::make_shared(std::make_shared(), std::make_shared())}, + {"write_stats", std::make_shared(std::make_shared(), std::make_shared())}, + {"hit_stats", std::make_shared(std::make_shared(), std::make_shared())}, + }; +} + +StorageSystemDiskTTLCacheTables::StorageSystemDiskTTLCacheTables(const StorageID & table_id_) + : IStorageSystemOneBlock(table_id_) +{ +} + +static void dumpStatsToMapColumn(const std::unordered_map & map, IColumn * column) +{ + auto * column_map = column ? &typeid_cast(*column) : nullptr; + if (!column_map) + return; + + auto & offsets = column_map->getOffsets(); + auto & key_column = column_map->getKey(); + auto & value_column = column_map->getValue(); + + size_t size = 0; + for (const auto & entry : map) + { + key_column.insertData(entry.first.c_str(), entry.first.size()); + value_column.insert(entry.second); + size++; + } + + offsets.push_back((offsets.size() == 0 ? 
0 : offsets.back()) + size); +} + +static void fillRowFromProto(MutableColumns & res_columns, const String & worker_id, const Protos::TTLCacheTableStats & t) +{ + size_t col_idx = 0; + + res_columns[col_idx++]->insert(worker_id); + res_columns[col_idx++]->insert(t.table_name()); + res_columns[col_idx++]->insert(t.table_uuid()); + res_columns[col_idx++]->insert(t.ttl_minutes()); + res_columns[col_idx++]->insert(t.max_size_bytes()); + res_columns[col_idx++]->insert(t.last_eviction_run()); + + { + std::unordered_map eviction_map; + eviction_map["expired"] = t.evicted_expired(); + eviction_map["size_limit"] = t.evicted_size_limit(); + eviction_map["async_triggered_evicted"] = t.async_triggered_evicted(); + eviction_map["async_skipped_rate_limit_evicted"] = t.async_skipped_rate_limit_evicted(); + dumpStatsToMapColumn(eviction_map, res_columns[col_idx++].get()); + } + + { + std::unordered_map rejection_map; + rejection_map["non_time_partition"] = t.rejected_non_time_partition(); + rejection_map["too_old"] = t.rejected_too_old(); + dumpStatsToMapColumn(rejection_map, res_columns[col_idx++].get()); + } + + { + std::unordered_map write_map; + write_map["count_preload"] = t.count_preload(); + write_map["count_query"] = t.count_query(); + write_map["bytes_preload"] = t.bytes_preload(); + write_map["bytes_query"] = t.bytes_query(); + write_map["count_restored"] = t.count_restored(); + write_map["bytes_restored"] = t.bytes_restored(); + write_map["idx_count_preload"] = t.idx_count_preload(); + write_map["idx_bytes_preload"] = t.idx_bytes_preload(); + write_map["idx_count_query"] = t.idx_count_query(); + write_map["idx_bytes_query"] = t.idx_bytes_query(); + dumpStatsToMapColumn(write_map, res_columns[col_idx++].get()); + } + { + std::unordered_map hit_map; + hit_map["data_hits"] = t.data_hits(); + hit_map["data_misses"] = t.data_misses(); + hit_map["idx_hits"] = t.idx_hits(); + hit_map["idx_misses"] = t.idx_misses(); + dumpStatsToMapColumn(hit_map, res_columns[col_idx++].get()); 
+ } +} + +void StorageSystemDiskTTLCacheTables::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const +{ + if (context->getServerType() == ServerType::cnch_server) + { + // Fan out to all workers via RPC using RM worker list — same pattern as system.workers. + // This works in any context without requiring a VW to be set. + auto * log = &Poco::Logger::get("StorageSystemDiskTTLCacheTables"); + std::vector all_workers; + try + { + auto rm_client = context->getResourceManagerClient(); + if (!rm_client) + { + LOG_WARNING(log, "ResourceManager client unavailable, returning empty result"); + return; + } + rm_client->getAllWorkers(all_workers); + } + catch (...) + { + tryLogCurrentException(log, "Failed to get workers from ResourceManager"); + return; + } + + LOG_INFO(log, "Querying TTL cache stats from {} worker(s)", all_workers.size()); + auto & pools = context->getCnchWorkerClientPools(); + for (const auto & wd : all_workers) + { + if (wd.vw_name == ResourceManagement::toSystemVWName(ResourceManagement::VirtualWarehouseType::Write)) + continue; + LOG_INFO(log, "Sending getTTLCacheStats RPC to {}", wd.host_ports.getRPCAddress()); + try + { + auto worker = pools.getWorker(wd.host_ports); + auto stats = worker->getTTLCacheStats(); + LOG_INFO(log, "Got {} TTL cache entries from {}", stats.size(), wd.host_ports.getRPCAddress()); + for (const auto & t : stats) + fillRowFromProto(res_columns, wd.id.empty() ? wd.host_ports.getRPCAddress() : wd.id, t); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + return; + } + + // On worker: read directly from local DiskCacheFactory registry + String worker_id = getWorkerID(context); + auto ttl_caches = DiskCacheFactory::instance().getAllTableTTLCaches(); + for (const auto & [uuid, cache_ptr] : ttl_caches) + { + auto * ttl_cache = dynamic_cast(cache_ptr.get()); + if (!ttl_cache) + continue; + + auto stats = ttl_cache->getStats(); + + Protos::TTLCacheTableStats t; + t.set_table_name(ttl_cache->getName()); + t.set_table_uuid(stats.table_uuid); + t.set_ttl_minutes(ttl_cache->getTTLMinutes()); + t.set_max_size_bytes(ttl_cache->getMaxSizeBytes()); + t.set_last_eviction_run(stats.last_eviction_run); + t.set_evicted_expired(stats.evicted_expired); + t.set_evicted_size_limit(stats.evicted_size_limit); + t.set_async_triggered_evicted(stats.async_eviction_triggered); + t.set_async_skipped_rate_limit_evicted(stats.async_eviction_skipped_rate_limit); + t.set_rejected_non_time_partition(stats.rejected_non_time_partition); + t.set_rejected_too_old(stats.rejected_too_old); + t.set_count_preload(stats.cached_from_preload); + t.set_count_query(stats.cached_from_query); + t.set_bytes_preload(stats.cached_bytes_preload); + t.set_bytes_query(stats.cached_bytes_query); + t.set_count_restored(stats.cached_from_restored); + t.set_bytes_restored(stats.cached_bytes_restored); + t.set_idx_count_preload(stats.cached_idx_from_preload); + t.set_idx_bytes_preload(stats.cached_idx_bytes_preload); + t.set_idx_count_query(stats.cached_idx_from_query); + t.set_idx_bytes_query(stats.cached_idx_bytes_query); + t.set_data_hits(stats.data_hits); + t.set_data_misses(stats.data_misses); + t.set_idx_hits(stats.idx_hits); + t.set_idx_misses(stats.idx_misses); + + fillRowFromProto(res_columns, worker_id, t); + } +} + +} diff --git a/src/Storages/System/StorageSystemDiskTTLCacheTables.h b/src/Storages/System/StorageSystemDiskTTLCacheTables.h new file mode 100644 index 00000000000..d5f1255aa1e --- /dev/null 
+++ b/src/Storages/System/StorageSystemDiskTTLCacheTables.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class Context; + +/** Implements system table disk_ttl_cache_tables + * Shows per-table TTL disk cache statistics + */ +class StorageSystemDiskTTLCacheTables final : public shared_ptr_helper, + public IStorageSystemOneBlock +{ + friend struct shared_ptr_helper; +public: + std::string getName() const override { return "SystemDiskTTLCacheTables"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: + StorageSystemDiskTTLCacheTables(const StorageID & table_id_); + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index ef4d243666d..19fe4cee1c7 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -162,6 +162,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -310,6 +313,9 @@ void attachSystemTablesServer(IDatabase & system_database, bool has_zookeeper) #endif attach(system_database, "cnch_transaction_clean_tasks"); attach(system_database, "schema_inference_cache"); + attach(system_database, "disk_ttl_cache_tables"); + attach(system_database, "disk_ttl_cache_partitions"); + attach(system_database, "disk_ttl_cache_preloads"); } void attachSystemTablesAsync(IDatabase & system_database, AsynchronousMetrics & async_metrics) diff --git a/src/Transaction/Actions/InsertAction.cpp b/src/Transaction/Actions/InsertAction.cpp index a763cc8a76e..b8aebf718e8 100644 --- a/src/Transaction/Actions/InsertAction.cpp +++ b/src/Transaction/Actions/InsertAction.cpp @@ -149,8 +149,8 @@ void InsertAction::checkAndSetDedupMode(CnchDedupHelper::DedupMode dedup_mode_) throw Exception( ErrorCodes::LOGICAL_ERROR, "Dedup mode is {}, but staged parts are not empty for table {}, it's a bug!", - 
table->getCnchStorageID().getNameForLogs(), - typeToString(dedup_mode_)); + typeToString(dedup_mode_), + table->getCnchStorageID().getNameForLogs()); LOG_TRACE(log, "Table {} is in {} mode.", table->getCnchStorageID().getNameForLogs(), typeToString(dedup_mode_)); }