From 5fe6b68b89f0d7d2ac05a647cb9832e052111219 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 24 Sep 2025 09:24:56 +0200 Subject: [PATCH 01/10] Checker: never reschedule checkables with already running checks This commit changes the ordering of CheckableScheduleInfo in the multi-index container to ensure that checkables with running checks are pushed to the end of the ordering. This prevents them from being prioritized for scheduling ahead of others, which could lead to unnecessary CPU load due to repeated scheduling attempts. By using a very large value for the index of checkables with running checks, they are effectively deprioritized until their current check is completed and they can be reinserted with their actual next check time. --- lib/checker/checkercomponent.hpp | 28 ++++++++++++++++------------ lib/icinga/checkable-check.cpp | 30 ++++++++++++++++++------------ lib/icinga/checkable.hpp | 5 +++-- 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/lib/checker/checkercomponent.hpp b/lib/checker/checkercomponent.hpp index edd3775e58..6f7a237159 100644 --- a/lib/checker/checkercomponent.hpp +++ b/lib/checker/checkercomponent.hpp @@ -26,21 +26,25 @@ struct CheckableScheduleInfo { Checkable::Ptr Object; double NextCheck; -}; - -/** - * @ingroup checker - */ -struct CheckableNextCheckExtractor -{ - typedef double result_type; /** - * @threadsafety Always. + * Get the index value for ordering in the multi-index container. + * + * This function returns a very large value for checkables that have a running check, effectively pushing + * them to the end of the ordering. This ensures that checkables with running checks are not prioritized + * for scheduling ahead of others. Rescheduling of such checkables is unnecessary because the checkable + * is going to reject this anyway if it notices that a check is already running, so avoiding unnecessary + * CPU load. Once the running check is finished, the checkable will be re-inserted into the set with its + * actual next check time as the index value. + * + * @return The index value for ordering in the multi-index container. */ - double operator()(const CheckableScheduleInfo& csi) + double Index() const { - return csi.NextCheck; + if (Object->HasRunningCheck()) { + return std::numeric_limits::max(); + } + return NextCheck; } }; @@ -57,7 +61,7 @@ class CheckerComponent final : public ObjectImpl CheckableScheduleInfo, boost::multi_index::indexed_by< boost::multi_index::ordered_unique >, - boost::multi_index::ordered_non_unique + boost::multi_index::ordered_non_unique> > > CheckableSet; diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index fe7b651bed..34b0005e3e 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -45,7 +45,7 @@ void Checkable::SetSchedulingOffset(long offset) m_SchedulingOffset = offset; } -long Checkable::GetSchedulingOffset() +long Checkable::GetSchedulingOffset() const { return m_SchedulingOffset; } @@ -86,6 +86,11 @@ bool Checkable::HasBeenChecked() const return GetLastCheckResult() != nullptr; } +bool Checkable::HasRunningCheck() const +{ + return m_CheckRunning; +} + double Checkable::GetLastCheck() const { CheckResult::Ptr cr = GetLastCheckResult(); @@ -105,7 +110,7 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr VERIFY(producer); ObjectLock olock(this); - m_CheckRunning = false; + m_CheckRunning.store(false); double now = Utility::GetTime(); @@ -561,6 +566,10 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) { CONTEXT("Executing check for object '" << GetName() << "'"); + /* don't run another check if there is one pending */ + if (m_CheckRunning.exchange(true)) + return; // Should never happen as the checker already takes care of this. + /* keep track of scheduling info in case the check type doesn't provide its own information */ double scheduled_start = GetNextCheck(); double before_check = Utility::GetTime(); @@ -578,12 +587,6 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) { ObjectLock olock(this); - /* don't run another check if there is one pending */ - if (m_CheckRunning) - return; - - m_CheckRunning = true; - SetLastStateRaw(GetStateRaw()); SetLastStateType(GetLastStateType()); SetLastReachable(reachable); @@ -669,10 +672,13 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) ProcessCheckResult(cr, producer); } - { - ObjectLock olock(this); - m_CheckRunning = false; - } + /** + * If this is a remote check, we don't know when the check result will be received and processed. + * Therefore, we must mark the check as no longer running here, otherwise, no further checks + * would be executed for this checkable as it would always appear as having a running check + * (see the check at the start of this function). + */ + m_CheckRunning.store(false); } } diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp index 796421a9b6..9141dcf482 100644 --- a/lib/icinga/checkable.hpp +++ b/lib/icinga/checkable.hpp @@ -101,7 +101,7 @@ class Checkable : public ObjectImpl intrusive_ptr GetCheckCommand() const; TimePeriod::Ptr GetCheckPeriod() const; - long GetSchedulingOffset(); + long GetSchedulingOffset() const; void SetSchedulingOffset(long offset); void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr); @@ -109,6 +109,7 @@ class Checkable : public ObjectImpl static String StateTypeToString(StateType type); bool HasBeenChecked() const; + bool HasRunningCheck() const; virtual bool IsStateOK(ServiceState state) const = 0; double GetLastCheck() const final; @@ -223,7 +224,7 @@ class Checkable : public ObjectImpl private: mutable std::mutex m_CheckableMutex; - bool m_CheckRunning{false}; + Atomic m_CheckRunning{false}; long m_SchedulingOffset; static std::mutex m_StatsMutex; From f049f5258acd974f58ff8cd19e73225b582d8e7c Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 24 Sep 2025 09:37:48 +0200 Subject: [PATCH 02/10] Checkable: update `next_check` ts in `ExecuteCheck` only if it's needed Since the scheduler accounts for already running checks, we only need to update the `next_check` timestamp in `Checkable::ExecuteCheck()` only where it actually makes sense to do so, and as for local checks this doesn't make sense at all. There only two cases where we need to update the next check beforehand: 1) The execute command event is sent to a connected remote endpoint, so we need to set the next check to a time in the future until we actually receive the check result back from the remote endpoint. However, it must not be too far in the future to avoid that the check is not re-run for too long in case the remote endpoint never responds. 2) The check is a remote check, but either the endpoint is currently syncing replay logs or not connected at all, and we are within the magical 5min cold startup window. In these cases, the check is effectively skipped, and there will be no check result for it coming in, we manually update the next check normally as if the check was executed. In the other cases, either the check is executed locally, which means the `m_RunningCheck` flag already prevents the scheduler from re-running the check, or this is a remote check and the endpoint is not connected, but we are outside the cold startup window, in which case we also don't do anything as we've already called `Checkable::ProcessCheckResult()` with an appropriate error state, which in turn will call `Checkable::UpdateNextCheck()`. --- lib/icinga/checkable-check.cpp | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index 34b0005e3e..67d425b24b 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -576,12 +576,6 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) SetLastCheckStarted(Utility::GetTime()); - /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending - * queues and ensures that checks are not fired multiple times. ProcessCheckResult() - * is called too late. See #6421. - */ - UpdateNextCheck(); - bool reachable = IsReachable(); { @@ -643,11 +637,16 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) if (listener) listener->SyncSendMessage(endpoint, message); - /* Re-schedule the check so we don't run it again until after we've received - * a check result from the remote instance. The check will be re-scheduled - * using the proper check interval once we've received a check result. + /* + * Let the checker use a dummy next check time until we actually receive the check result from the + * remote endpoint. This should be sufficiently far in the future to avoid excessive CPU load by + * constantly re-running the check, but not too far in the future to avoid that the check is not + * re-run for too long in case the remote endpoint never responds. We add a small grace period + * to the check command timeout to account for network latency and processing time on the remote + * endpoint. So, we only need to silently update this without notifying any listeners, and once + * this function returns, the checker is going access it via GetNextCheck() again. */ - SetNextCheck(Utility::GetTime() + checkTimeout + 30); + SetNextCheck(Utility::GetTime() + checkTimeout + 30, true); /* * Let the user know that there was a problem with the check if @@ -670,6 +669,13 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) cr->SetOutput(output); ProcessCheckResult(cr, producer); + } else { + /** + * The endpoint is currently either syncing its state or not connected yet and we are within + * the magical 5min cold startup window. In both cases, we just don't do anything and wait for + * the next check interval to re-try the check again. So, this check is effectively skipped. + */ + UpdateNextCheck(); } /** From fc8de9e087f70bb779bfb2d266d2ab5b146ba1b2 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 23 Sep 2025 18:10:57 +0200 Subject: [PATCH 03/10] Introduce `OnRescheduleCheck` signal & make use of it where appropriate This commit introduces a new kinda special `OnRescheduleCheck` signal that is emitted whenever we want to inform the checker to reschedule the checkable at a specific timestamp without actually changing the next check time. Previously, we called `SetNextCheck` with some random timestamp just to enforce the checker to either pick it up immediately or at a specific time. Then at some point in time, subscribing to the `OnNextCheckChanged` signal became effectively unusable for any other purpose than to inform the checker about a new next check time. Thus, it resulted in introducing a new signal that is solely responsible for informing the Icigna DB and IDO about a new next check time in places where calling `SetNextCheck` did make sense. This commit does quite the opposite: it replaces all calls to `SetNextCheck` that were only used to inform the checker about a new next check time wit `OnRescheduleCheck` calls. Only places where we actually wanted to change the next check time still call `SetNextCheck` and thus inform the checker and all other listeners about the new next check time. And as a bonus point, we now got rid of the two object locks for child and parent at the same time. --- lib/checker/checkercomponent.cpp | 13 ++++++++-- lib/checker/checkercomponent.hpp | 2 +- lib/icinga/checkable-check.cpp | 41 ++++++++++++++++++++++++-------- lib/icinga/checkable.cpp | 16 +++++++++++-- lib/icinga/checkable.hpp | 14 ++++++++++- 5 files changed, 70 insertions(+), 16 deletions(-) diff --git a/lib/checker/checkercomponent.cpp b/lib/checker/checkercomponent.cpp index 04583833a0..66abdd944d 100644 --- a/lib/checker/checkercomponent.cpp +++ b/lib/checker/checkercomponent.cpp @@ -55,6 +55,9 @@ void CheckerComponent::OnConfigLoaded() Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { NextCheckChangedHandler(checkable); }); + Checkable::OnRescheduleCheck.connect([this](const Checkable::Ptr& checkable, double nextCheck) { + NextCheckChangedHandler(checkable, nextCheck); + }); } void CheckerComponent::Start(bool runtimeCreated) @@ -341,7 +344,7 @@ CheckableScheduleInfo CheckerComponent::GetCheckableScheduleInfo(const Checkable return csi; } -void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable) +void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck) { std::unique_lock lock(m_Mutex); @@ -356,7 +359,13 @@ void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable) idx.erase(checkable); - CheckableScheduleInfo csi = GetCheckableScheduleInfo(checkable); + CheckableScheduleInfo csi; + if (nextCheck < 0) { + csi = GetCheckableScheduleInfo(checkable); + } else { + csi.NextCheck = nextCheck; + csi.Object = checkable; + } idx.insert(csi); m_CV.notify_all(); diff --git a/lib/checker/checkercomponent.hpp b/lib/checker/checkercomponent.hpp index 6f7a237159..88cb47797b 100644 --- a/lib/checker/checkercomponent.hpp +++ b/lib/checker/checkercomponent.hpp @@ -93,7 +93,7 @@ class CheckerComponent final : public ObjectImpl void AdjustCheckTimer(); void ObjectHandler(const ConfigObject::Ptr& object); - void NextCheckChangedHandler(const Checkable::Ptr& checkable); + void NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck = -1); void RescheduleCheckTimer(); diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index 67d425b24b..7d1fd29f5c 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -23,6 +23,7 @@ boost::signals2::signal, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged; boost::signals2::signal Checkable::OnNotificationsRequested; boost::signals2::signal Checkable::OnNextCheckUpdated; +boost::signals2::signal Checkable::OnRescheduleCheck; Atomic Checkable::CurrentConcurrentChecks (0); @@ -50,7 +51,17 @@ long Checkable::GetSchedulingOffset() const return m_SchedulingOffset; } -void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) +/** + * Update the next check time of this checkable based on its check interval and last check time. + * + * If onlyReschedule is true, the next check time is not actually updated, but the @c Checkable::OnRescheduleCheck + * signal is emitted with the new calculated next check time. Otherwise, the next check time is updated + * and the @c Checkable::OnNextCheckChanged signal is emitted accordingly. + * + * @param origin The origin of the message triggering this update, can be nullptr. + * @param onlyReschedule If true, only emit @c OnRescheduleCheck without updating the next check time. + */ +void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin, bool onlyReschedule) { double interval; @@ -78,7 +89,14 @@ void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) << " (" << lastCheck << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ")."; - SetNextCheck(nextCheck, false, origin); + if (onlyReschedule) { + // Someone requested to only reschedule the next check without actually changing it. + // So, just tell the checker about this new timestamp and return. + OnRescheduleCheck(this, nextCheck); + } else { + // Otherwise, set the next check to the newly calculated timestamp and inform all its listeners. + SetNextCheck(nextCheck, false, origin); + } } bool Checkable::HasBeenChecked() const @@ -518,12 +536,15 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr if (recovery) { for (auto& child : children) { if (child->GetProblem() && child->GetEnableActiveChecks()) { - auto nextCheck (now + Utility::Random() % 60); - - ObjectLock oLock (child); - - if (nextCheck < child->GetNextCheck()) { - child->SetNextCheck(nextCheck); + if (auto nextCheck (now + Utility::Random() % 60); nextCheck < child->GetNextCheck()) { + /** + * We only want to enforce the checker to pick this up sooner, and no need to actually change + * the timesatmp. Plus, no other listeners should be informed about this other than the checker, + * so we emit the OnRescheduleCheck signal directly. In case our checker isn't responsible for + * this child object, we've already broadcasted the `CheckResult` event which will cause on the + * responsible node to enter this exact branch and do the rescheduling for its own checker. + */ + OnRescheduleCheck(child, nextCheck); } } } @@ -539,8 +560,8 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr continue; if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) { - ObjectLock olock(parent); - parent->SetNextCheck(now); + // See comment above for children. We want to just enforce an immediate check by our checker. + OnRescheduleCheck(parent, now); } } } diff --git a/lib/icinga/checkable.cpp b/lib/icinga/checkable.cpp index dbe73fe555..3a1262744f 100644 --- a/lib/icinga/checkable.cpp +++ b/lib/icinga/checkable.cpp @@ -88,14 +88,26 @@ void Checkable::Start(bool runtimeCreated) auto cr (GetLastCheckResult()); if (GetLastCheckStarted() > (cr ? cr->GetExecutionEnd() : 0.0)) { - SetNextCheck(GetLastCheckStarted()); + /** + * Each node performs a check immediately after startup before the object authority is determined. + * Meaning, for runtime created objects we should not enter this branch and for non-runtime created + * objects we're going to call `ApiListener::UpdateObjectAuthority()` after all objects have been + * started, so we can safely assume that this timestamp is only relevant for our own checker. + * + * However, until the authority is determined, by default all objects are paused and thus triggering + * `OnRescheduleCheck` would have no effect, as the checker component ignores paused objects. Therefore, + * our only option is to set this dummy ts silently here, so that when the object is unpaused later, + * the checker component will pick it up and schedule the next check properly. + */ + SetNextCheck(GetLastCheckStarted(), true); } } if (GetNextCheck() < now + 60) { double delta = std::min(GetCheckInterval(), 60.0); delta *= (double)std::rand() / RAND_MAX; - SetNextCheck(now + delta); + // We only want to jitter the next check a bit, and inform the scheduler about it, so not setting it directly. + SetNextCheck(now + delta, true); } ObjectImpl::Start(runtimeCreated); diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp index 9141dcf482..3ee4125047 100644 --- a/lib/icinga/checkable.hpp +++ b/lib/icinga/checkable.hpp @@ -104,7 +104,7 @@ class Checkable : public ObjectImpl long GetSchedulingOffset() const; void SetSchedulingOffset(long offset); - void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr); + void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr, bool onlyReschedule = false); static String StateTypeToString(StateType type); @@ -148,6 +148,18 @@ class Checkable : public ObjectImpl static boost::signals2::signal OnAcknowledgementCleared; static boost::signals2::signal OnFlappingChange; static boost::signals2::signal OnNextCheckUpdated; + /** + * Think again! Are you really sure you want to subscribe to this signal? + * + * This signal is a very special and noisy one. It is emitted whenever someone wants to enforce the + * LOCAL scheduler to reschedule the next check of a checkable at the given timestamp. This can be + * due to a number of reasons, for example: Icinga 2 was interrupted while a check was being executed, + * and the checkable needs to be rescheduled on startup; or a parent host changed its state and all its + * children need to be rescheduled to reflect the new reachability state. There are other cases as well, + * but the point is: this signal is meant to only be used by the local @c CheckerComponent to update its + * internal queues. If you want to use this for something else, think twice!! + */ + static boost::signals2::signal OnRescheduleCheck; static boost::signals2::signal OnEventCommandExecuted; static Atomic CurrentConcurrentChecks; From b1e3a8a4366b43ec5783f5bf31cbcd30d3524e5e Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 23 Sep 2025 18:47:20 +0200 Subject: [PATCH 04/10] Let Icinga DB & IDO subscribe to `OnNextCheckChanged` signal It also removes the extra `SendNextUpdate()` call from the `NewCheckResultHandler` handler in Icinga DB, since it's subscribed to the `NextCheckChanged` event anyway and that event is always emitted before the `NewCheckResult` event gets triggered. This call became redundant. --- lib/db_ido/dbevents.cpp | 2 +- lib/icingadb/icingadb-objects.cpp | 7 +++---- lib/icingadb/icingadb.hpp | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/db_ido/dbevents.cpp b/lib/db_ido/dbevents.cpp index cf80a0226d..11a83d2c12 100644 --- a/lib/db_ido/dbevents.cpp +++ b/lib/db_ido/dbevents.cpp @@ -40,7 +40,7 @@ void DbEvents::StaticInitialize() DbEvents::RemoveAcknowledgement(checkable); }); - Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable) { NextCheckUpdatedHandler(checkable); }); + Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const Value&) { NextCheckUpdatedHandler(checkable); }); Checkable::OnFlappingChanged.connect([](const Checkable::Ptr& checkable, const Value&) { FlappingChangedHandler(checkable); }); Checkable::OnNotificationSentToAllUsers.connect([](const Notification::Ptr& notification, const Checkable::Ptr& checkable, const std::set&, const NotificationType&, const CheckResult::Ptr&, const String&, const String&, diff --git a/lib/icingadb/icingadb-objects.cpp b/lib/icingadb/icingadb-objects.cpp index a8c2ed0b2c..390d30063f 100644 --- a/lib/icingadb/icingadb-objects.cpp +++ b/lib/icingadb/icingadb-objects.cpp @@ -135,8 +135,8 @@ void IcingaDB::ConfigStaticInitialize() IcingaDB::NewCheckResultHandler(checkable); }); - Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable) { - IcingaDB::NextCheckUpdatedHandler(checkable); + Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const Value&) { + IcingaDB::NextCheckChangedHandler(checkable); }); Service::OnHostProblemChanged.connect([](const Service::Ptr& service, const CheckResult::Ptr&, const MessageOrigin::Ptr&) { @@ -3206,11 +3206,10 @@ void IcingaDB::NewCheckResultHandler(const Checkable::Ptr& checkable) { for (auto& rw : ConfigType::GetObjectsByType()) { rw->UpdateState(checkable, StateUpdate::Volatile); - rw->SendNextUpdate(checkable); } } -void IcingaDB::NextCheckUpdatedHandler(const Checkable::Ptr& checkable) +void IcingaDB::NextCheckChangedHandler(const Checkable::Ptr& checkable) { for (auto& rw : ConfigType::GetObjectsByType()) { rw->UpdateState(checkable, StateUpdate::Volatile); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index 33b6414c89..778a13b23c 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -202,7 +202,7 @@ class IcingaDB : public ObjectImpl static void CommentRemovedHandler(const Comment::Ptr& comment); static void FlappingChangeHandler(const Checkable::Ptr& checkable, double changeTime); static void NewCheckResultHandler(const Checkable::Ptr& checkable); - static void NextCheckUpdatedHandler(const Checkable::Ptr& checkable); + static void NextCheckChangedHandler(const Checkable::Ptr& checkable); static void DependencyGroupChildRegisteredHandler(const Checkable::Ptr& child, const DependencyGroup::Ptr& dependencyGroup); static void DependencyGroupChildRemovedHandler(const DependencyGroup::Ptr& dependencyGroup, const std::vector& dependencies, bool removeGroup); static void HostProblemChangedHandler(const Service::Ptr& service); From a58a69f7155fbc16486c2fc619d04cab860a76a1 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 23 Sep 2025 18:56:59 +0200 Subject: [PATCH 05/10] Drop the now superfluous `OnNextCheckUpdated` signal --- lib/checker/checkercomponent.cpp | 8 -------- lib/icinga/apiactions.cpp | 3 --- lib/icinga/checkable-check.cpp | 1 - lib/icinga/checkable.hpp | 1 - lib/icinga/externalcommandprocessor.cpp | 18 ------------------ 5 files changed, 31 deletions(-) diff --git a/lib/checker/checkercomponent.cpp b/lib/checker/checkercomponent.cpp index 66abdd944d..525d15705a 100644 --- a/lib/checker/checkercomponent.cpp +++ b/lib/checker/checkercomponent.cpp @@ -138,7 +138,6 @@ void CheckerComponent::CheckThreadProc() bool forced = checkable->GetForceNextCheck(); bool check = true; - bool notifyNextCheck = false; double nextCheck = -1; if (!forced) { @@ -147,7 +146,6 @@ void CheckerComponent::CheckThreadProc() << "Skipping check for object '" << checkable->GetName() << "': Dependency failed."; check = false; - notifyNextCheck = true; } Host::Ptr host; @@ -184,7 +182,6 @@ void CheckerComponent::CheckThreadProc() << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck); check = false; - notifyNextCheck = true; } } } @@ -203,11 +200,6 @@ void CheckerComponent::CheckThreadProc() checkable->UpdateNextCheck(); } - if (notifyNextCheck) { - // Trigger update event for Icinga DB - Checkable::OnNextCheckUpdated(checkable); - } - lock.lock(); continue; diff --git a/lib/icinga/apiactions.cpp b/lib/icinga/apiactions.cpp index e7c2e35109..3586bf2ecd 100644 --- a/lib/icinga/apiactions.cpp +++ b/lib/icinga/apiactions.cpp @@ -159,9 +159,6 @@ Dictionary::Ptr ApiActions::RescheduleCheck(const ConfigObject::Ptr& object, checkable->SetNextCheck(nextCheck); - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(checkable); - return ApiActions::CreateResult(200, "Successfully rescheduled check for object '" + checkable->GetName() + "'."); } diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index 7d1fd29f5c..963dde38dc 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -22,7 +22,6 @@ boost::signals2::signal Checkable::OnStateChange; boost::signals2::signal, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged; boost::signals2::signal Checkable::OnNotificationsRequested; -boost::signals2::signal Checkable::OnNextCheckUpdated; boost::signals2::signal Checkable::OnRescheduleCheck; Atomic Checkable::CurrentConcurrentChecks (0); diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp index 3ee4125047..b4ae94f1ee 100644 --- a/lib/icinga/checkable.hpp +++ b/lib/icinga/checkable.hpp @@ -147,7 +147,6 @@ class Checkable : public ObjectImpl bool, bool, double, double, const MessageOrigin::Ptr&)> OnAcknowledgementSet; static boost::signals2::signal OnAcknowledgementCleared; static boost::signals2::signal OnFlappingChange; - static boost::signals2::signal OnNextCheckUpdated; /** * Think again! Are you really sure you want to subscribe to this signal? * diff --git a/lib/icinga/externalcommandprocessor.cpp b/lib/icinga/externalcommandprocessor.cpp index 33831a4910..b61f640fd2 100644 --- a/lib/icinga/externalcommandprocessor.cpp +++ b/lib/icinga/externalcommandprocessor.cpp @@ -392,9 +392,6 @@ void ExternalCommandProcessor::ScheduleHostCheck(double, const std::vectorSetNextCheck(planned_check); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(host); } void ExternalCommandProcessor::ScheduleForcedHostCheck(double, const std::vector& arguments) @@ -409,9 +406,6 @@ void ExternalCommandProcessor::ScheduleForcedHostCheck(double, const std::vector host->SetForceNextCheck(true); host->SetNextCheck(Convert::ToDouble(arguments[1])); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(host); } void ExternalCommandProcessor::ScheduleSvcCheck(double, const std::vector& arguments) @@ -437,9 +431,6 @@ void ExternalCommandProcessor::ScheduleSvcCheck(double, const std::vectorSetNextCheck(planned_check); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } void ExternalCommandProcessor::ScheduleForcedSvcCheck(double, const std::vector& arguments) @@ -454,9 +445,6 @@ void ExternalCommandProcessor::ScheduleForcedSvcCheck(double, const std::vector< service->SetForceNextCheck(true); service->SetNextCheck(Convert::ToDouble(arguments[2])); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } void ExternalCommandProcessor::EnableHostCheck(double, const std::vector& arguments) @@ -538,9 +526,6 @@ void ExternalCommandProcessor::ScheduleForcedHostSvcChecks(double, const std::ve service->SetNextCheck(planned_check); service->SetForceNextCheck(true); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } } @@ -568,9 +553,6 @@ void ExternalCommandProcessor::ScheduleHostSvcChecks(double, const std::vectorGetName() << "'"; service->SetNextCheck(planned_check); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } } From 9a5fd4bf96bc543a00185a6e2e34e4848dc6e9a0 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 23 Sep 2025 18:57:41 +0200 Subject: [PATCH 06/10] ClusterEvents: add special special handling for `SetNextCheck` events --- lib/icinga/clusterevents.cpp | 27 ++++++++++++++++++++++++--- lib/icinga/clusterevents.hpp | 2 +- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/lib/icinga/clusterevents.cpp b/lib/icinga/clusterevents.cpp index d76a9351aa..e0ef3c00c8 100644 --- a/lib/icinga/clusterevents.cpp +++ b/lib/icinga/clusterevents.cpp @@ -46,7 +46,9 @@ REGISTER_APIFUNCTION(SetRemovalInfo, event, &ClusterEvents::SetRemovalInfoAPIHan void ClusterEvents::StaticInitialize() { Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler); - Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler); + Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin) { + NextCheckChangedHandler(checkable, origin); + }); Checkable::OnLastCheckStartedChanged.connect(&ClusterEvents::LastCheckStartedChangedHandler); Checkable::OnStateBeforeSuppressionChanged.connect(&ClusterEvents::StateBeforeSuppressionChangedHandler); Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler); @@ -183,7 +185,7 @@ Value ClusterEvents::CheckResultAPIHandler(const MessageOrigin::Ptr& origin, con return Empty; } -void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin) +void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin, bool isFromOldClient) { ApiListener::Ptr listener = ApiListener::GetInstance(); @@ -199,6 +201,7 @@ void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, con if (service) params->Set("service", service->GetShortName()); params->Set("next_check", checkable->GetNextCheck()); + params->Set("origin_client_old", isFromOldClient); Dictionary::Ptr message = new Dictionary(); message->Set("jsonrpc", "2.0"); @@ -245,7 +248,25 @@ Value ClusterEvents::NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin if (nextCheck < Application::GetStartTime() + 60) return Empty; - checkable->SetNextCheck(params->Get("next_check"), false, origin); + bool isOriginClientOld = params->Get("origin_client_old").ToBool(); + if (isOriginClientOld || endpoint->GetIcingaVersion() < 21600) { // TODO: Exact required version?? + // All older Icinga 2 versions trigger the `event::SetNextCheck` event endlessly, so we can't let the + // `Checkable::OnNextCheckChanged` know about this change, as it would cause too much noise in Icinga DB + // and the like. So, we just update the timestamp silently and manually call `Checkable::OnRescheduleCheck` + // to inform the checker about this change. + checkable->SetNextCheck(params->Get("next_check"), true, origin); + Checkable::OnRescheduleCheck(checkable, nextCheck); + // In case this message has to be relayed to other cluster nodes we directly call the `NextCheckChangedHandler`. + // The `isOriginClientOld` flag is necessary e.g. in a 3-level cluster setup where a very old agent sends + // the message to its satellite that then relays it to the master. The master then needs to know that this + // event originates from an old node and thus has to be treated specially. + NextCheckChangedHandler(checkable, origin, true); + } else { + // Peer is running the expected Icinga 2 version, so we can just do the normal flow. + // The `Checkable::OnNextCheckChanged` signal will be emitted as usual and all listeners + // will be informed about this change. + checkable->SetNextCheck(params->Get("next_check"), false, origin); + } return Empty; } diff --git a/lib/icinga/clusterevents.hpp b/lib/icinga/clusterevents.hpp index 8daf86ab3b..dd01b3a552 100644 --- a/lib/icinga/clusterevents.hpp +++ b/lib/icinga/clusterevents.hpp @@ -23,7 +23,7 @@ class ClusterEvents static void CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin); static Value CheckResultAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); - static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin); + static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin, bool isFromOldClient = false); static Value NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); static void LastCheckStartedChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin); From 5af6b3b2f5e66bb4bd39a929f0a4c5cac5b4adce Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 25 Sep 2025 09:34:28 +0200 Subject: [PATCH 07/10] tests: fix min severity doesn't get updated If the logger is started with `Activate()` before `SetActive()`, it won't log anything, as the logger updates the "min severity" value of loggers only when starting them, and if they're not active at that point, they will just be ignored, so the min severity remains at info. --- test/base-testloggerfixture.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/base-testloggerfixture.hpp b/test/base-testloggerfixture.hpp index 69c073b02a..c5a4f63e24 100644 --- a/test/base-testloggerfixture.hpp +++ b/test/base-testloggerfixture.hpp @@ -88,8 +88,8 @@ struct TestLoggerFixture TestLoggerFixture() { testLogger->SetSeverity(testLogger->SeverityToString(LogDebug)); - testLogger->Activate(true); testLogger->SetActive(true); + testLogger->Activate(true); } ~TestLoggerFixture() From 62a43a6d4ae3b386fa9b711d434f620f74898293 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 25 Sep 2025 09:46:28 +0200 Subject: [PATCH 08/10] tests: raise `Concurrency` config in global app fixture --- test/icingaapplication-fixture.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/icingaapplication-fixture.cpp b/test/icingaapplication-fixture.cpp index 80fa4bfd8e..32517ab128 100644 --- a/test/icingaapplication-fixture.cpp +++ b/test/icingaapplication-fixture.cpp @@ -8,6 +8,12 @@ static bool IcingaInitialized = false; IcingaApplicationFixture::IcingaApplicationFixture() { + // This limits the number of threads in the thread pool, but does not limit the number of concurrent checks. + // By default, we'll have only 2 threads in the pool, which is too low for some heavy checker tests. So, we + // set the concurrency to the number of hardware threads available on the system and the global thread pool + // size will be the double of that (see threadpool.cpp). + Configuration::Concurrency = std::thread::hardware_concurrency(); + if (!IcingaInitialized) InitIcingaApplication(); } From 39c1d10583bf258245a158e1f3434e52f4619401 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 25 Sep 2025 09:49:17 +0200 Subject: [PATCH 09/10] checker: make result timer interval configurable for testing --- lib/checker/checkercomponent.cpp | 17 ++++++++++++++++- lib/checker/checkercomponent.hpp | 8 ++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/lib/checker/checkercomponent.cpp b/lib/checker/checkercomponent.cpp index 525d15705a..d400b2b10c 100644 --- a/lib/checker/checkercomponent.cpp +++ b/lib/checker/checkercomponent.cpp @@ -71,7 +71,7 @@ void CheckerComponent::Start(bool runtimeCreated) m_Thread = std::thread([this]() { CheckThreadProc(); }); m_ResultTimer = Timer::Create(); - m_ResultTimer->SetInterval(5); + m_ResultTimer->SetInterval(m_ResultTimerInterval); m_ResultTimer->OnTimerExpired.connect([this](const Timer * const&) { ResultTimerHandler(); }); m_ResultTimer->Start(); } @@ -376,3 +376,18 @@ unsigned long CheckerComponent::GetPendingCheckables() return m_PendingCheckables.size(); } + +/** + * Sets the interval in seconds for the result timer. + * + * The result timer periodically logs the number of pending and idle checkables + * as well as the checks per second rate. The default interval is 5 seconds. + * + * Note, this method must be called before the component is started to have an effect on the timer. + * + * @param interval Interval in seconds for the result timer. + */ +void CheckerComponent::SetResultTimerInterval(double interval) +{ + m_ResultTimerInterval = interval; +} diff --git a/lib/checker/checkercomponent.hpp b/lib/checker/checkercomponent.hpp index 88cb47797b..dfd34f540e 100644 --- a/lib/checker/checkercomponent.hpp +++ b/lib/checker/checkercomponent.hpp @@ -73,12 +73,16 @@ class CheckerComponent final : public ObjectImpl unsigned long GetIdleCheckables(); unsigned long GetPendingCheckables(); + void SetResultTimerInterval(double interval); + private: std::mutex m_Mutex; std::condition_variable m_CV; bool m_Stopped{false}; std::thread m_Thread; + double m_ResultTimerInterval{5.0}; // Interval in seconds for the result timer. + CheckableSet m_IdleCheckables; CheckableSet m_PendingCheckables; @@ -90,13 +94,9 @@ class CheckerComponent final : public ObjectImpl void ExecuteCheckHelper(const Checkable::Ptr& checkable); - void AdjustCheckTimer(); - void ObjectHandler(const ConfigObject::Ptr& object); void NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck = -1); - void RescheduleCheckTimer(); - static CheckableScheduleInfo GetCheckableScheduleInfo(const Checkable::Ptr& checkable); }; From 5f3b7d2337d68d9b0340902d172d863deb0775d1 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 25 Sep 2025 09:53:47 +0200 Subject: [PATCH 10/10] test: add basic checker scheduling test cases --- test/CMakeLists.txt | 9 + test/base-testloggerfixture.hpp | 26 +++ test/checker-fixture.cpp | 201 +++++++++++++++++++++ test/checker-fixture.hpp | 127 +++++++++++++ test/checker.cpp | 311 ++++++++++++++++++++++++++++++++ 5 files changed, 674 insertions(+) create mode 100644 test/checker-fixture.cpp create mode 100644 test/checker-fixture.hpp create mode 100644 test/checker.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cb815da737..2d93afc565 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -129,6 +129,15 @@ set(base_test_SOURCES $ ) +if (ICINGA2_WITH_CHECKER) + list(APPEND base_test_SOURCES + checker.cpp + checker-fixture.cpp + checker-fixture.hpp + $ + ) +endif() + if(ICINGA2_UNITY_BUILD) mkunity_target(base test base_test_SOURCES) endif() diff --git a/test/base-testloggerfixture.hpp b/test/base-testloggerfixture.hpp index c5a4f63e24..a3571f8f9c 100644 --- a/test/base-testloggerfixture.hpp +++ b/test/base-testloggerfixture.hpp @@ -52,6 +52,27 @@ class TestLogger : public Logger return ret; } + /** + * Counts the number of log entries that match the given regex pattern. + * + * This only counts existing log entries, it does not wait for new ones to arrive. + * + * @param pattern The regex pattern the log message needs to match + * + * @return The number of log entries that match the given pattern + */ + auto CountExpectedLogPattern(const std::string& pattern) + { + std::lock_guard lock(m_Mutex); + int count = 0; + for (const auto& logEntry : m_LogEntries) { + if (boost::regex_match(logEntry.Message.GetData(), boost::regex(pattern))) { + ++count; + } + } + return count; + } + private: void ProcessLogEntry(const LogEntry& entry) override { @@ -93,6 +114,11 @@ struct TestLoggerFixture } ~TestLoggerFixture() + { + DeactivateLogger(); + } + + void DeactivateLogger() const { testLogger->SetActive(false); testLogger->Deactivate(true); diff --git a/test/checker-fixture.cpp b/test/checker-fixture.cpp new file mode 100644 index 0000000000..7e5de60f57 --- /dev/null +++ b/test/checker-fixture.cpp @@ -0,0 +1,201 @@ +/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */ + +#include "base/utility.hpp" +#include "config/configcompiler.hpp" +#include "remote/apilistener.hpp" +#include "test/checker-fixture.hpp" +#include "test/icingaapplication-fixture.hpp" + +using namespace icinga; + +CheckerFixture::CheckerFixture() +{ + checker = new CheckerComponent; + checker->SetResultTimerInterval(.4); // Speed up result timer for tests + checker->SetName("randomizer", true); + checker->Register(); + checker->OnConfigLoaded(); + checker->PreActivate(); + checker->Activate(); + + // Manually building and registering the command won't work here as we need a real callable + // function that produces cr and calls ProcessCheckResult on the checkable. So we use a small + // config snippet that imports the random and sleep check commands registered by the "methods-itl.cpp" + // file in the methods lib. + ConfigItem::RunWithActivationContext( + new Function{ + "checker-fixture", + [] { + std::unique_ptr expression = ConfigCompiler::CompileText( + "", + R"CONFIG( +object CheckCommand "random" { + import "random-check-command" +} + +object CheckCommand "sleep" { + import "sleep-check-command" +} +)CONFIG" + ); + BOOST_REQUIRE(expression); + ScriptFrame frame(true); + BOOST_CHECK_NO_THROW(expression->Evaluate(frame)); + } + } + ); + + Checkable::OnNewCheckResult.connect( + [this](const Checkable::Ptr&, const CheckResult::Ptr& cr, const MessageOrigin::Ptr&) { + ++resultCount; + lastResult.store(cr); + } + ); + // Track the next check times of the checkables, so we can verify that they are set to the expected + // value. The map is populated with the expected checkable names by the RegisterChecable() function. + // So, we can safely modify the map from within this signal handler without further locking, since there + // won't be any concurrent access to the same keys. + Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { + assert(nextCheckTimes.find(checkable->GetName()) != nextCheckTimes.end()); + nextCheckTimes[checkable->GetName()] = checkable->GetNextCheck(); + }); +} + +CheckerFixture::~CheckerFixture() +{ + checker->Deactivate(); +} + +void CheckerFixture::RegisterCheckablesRandom(int count, bool disableChecks, bool unreachable) +{ + for (int i = 1; i <= count; ++i) { + RegisterCheckable("host-" + std::to_string(i), "random", "", "", disableChecks, unreachable); + } +} + +void CheckerFixture::RegisterCheckablesSleep(int count, double sleepTime, bool disableChecks, bool unreachable) +{ + for (int i = 1; i <= count; ++i) { + auto h = RegisterCheckable("host-" + std::to_string(i), "sleep", "", "", disableChecks, unreachable); + h->SetVars(new Dictionary{{"sleep_time", sleepTime}}); + } +} + +void CheckerFixture::RegisterRemoteChecks(int count, bool isConnected, bool isSyncingReplayLogs) +{ + RegisterEndpoint("remote-checker", isConnected, isSyncingReplayLogs); + for (int i = 1; i <= count; ++i) { + auto h = RegisterCheckable("host-"+std::to_string(i), "random", "", "remote-checker"); + Checkable::OnRescheduleCheck(h, Utility::GetTime()); // Force initial scheduling + } +} + +Host::Ptr CheckerFixture::RegisterCheckable( + std::string name, + std::string cmd, + std::string period, + std::string endpoint, + bool disableChecks, + bool unreachable +) +{ + Host::Ptr host = new Host; + host->SetName(std::move(name), true); + host->SetCheckCommandRaw(std::move(cmd), true); + host->SetCheckInterval(checkInterval, true); + host->SetRetryInterval(retryInterval, true); + host->SetHAMode(HARunEverywhere, true); // Disable HA for tests + host->SetEnableActiveChecks(!disableChecks, true); + host->SetCheckPeriodRaw(std::move(period), true); + host->SetZoneName(endpoint, true); + host->SetCommandEndpointRaw(std::move(endpoint), true); + host->SetCheckTimeout(0, true); + host->Register(); + host->OnAllConfigLoaded(); + + nextCheckTimes[host->GetName()] = 0.0; // Initialize next check time tracking + + if (unreachable) { + Host::Ptr parent = new Host; + parent->SetName(Utility::NewUniqueID(), true); + parent->SetStateRaw(ServiceCritical, true); + parent->SetStateType(StateTypeHard, true); + parent->SetLastCheckResult(new CheckResult, true); + parent->Register(); + + Dependency::Ptr dep = new Dependency; + dep->SetName(Utility::NewUniqueID(), true); + dep->SetStateFilter(StateFilterUp, true); + dep->SetDisableChecks(true, true); + dep->SetParent(parent); + dep->SetChild(host); + dep->Register(); + + host->AddDependency(dep); + } + + host->PreActivate(); + host->Activate(); + return host; +} + +void CheckerFixture::SleepFor(double seconds, bool deactivateLogger) const +{ + Utility::Sleep(seconds); + Checkable::OnNextCheckChanged.disconnect_all_slots(); + Checkable::OnNewCheckResult.disconnect_all_slots(); + if (deactivateLogger) { + DeactivateLogger(); + } +} + +Endpoint::Ptr CheckerFixture::RegisterEndpoint(std::string name, bool isConnected, bool isSyncingReplayLogs) +{ + auto makeEndpoint = [](const std::string& name, bool syncing) { + Endpoint::Ptr remote = new Endpoint; + remote->SetName(name, true); + if (syncing) { + remote->SetSyncing(true); + } + remote->Register(); + remote->PreActivate(); + return remote; + }; + + Endpoint::Ptr remote = makeEndpoint(name, isSyncingReplayLogs); + Endpoint::Ptr local = makeEndpoint("local-tester", false); + + Zone::Ptr zone = new Zone; + zone->SetName(remote->GetName(), true); + zone->SetEndpointsRaw(new Array{{remote->GetName(), local->GetName()}}, true); + zone->Register(); + zone->OnAllConfigLoaded(); + zone->PreActivate(); + zone->Activate(); + + ApiListener::Ptr listener = new ApiListener; + listener->SetIdentity(local->GetName(), true); + listener->SetName(local->GetName(), true); + listener->Register(); + try { + listener->OnAllConfigLoaded(); // Initialize the m_LocalEndpoint of the listener! + + // May throw due to various reasons, but we only care that the m_Instance singleton + // is set which it will be if no other ApiListener is registered yet. + listener->OnConfigLoaded(); + } catch (const std::exception& ex) { + BOOST_TEST_MESSAGE("Exception during ApiListener::OnConfigLoaded: " << DiagnosticInformation(ex)); + } + + if (isConnected) { + JsonRpcConnection::Ptr client = new JsonRpcConnection( + new StoppableWaitGroup, + "anonymous", + false, + nullptr, + RoleClient + ); + remote->AddClient(client); + } + return remote; +} diff --git a/test/checker-fixture.hpp b/test/checker-fixture.hpp new file mode 100644 index 0000000000..963bcde39d --- /dev/null +++ b/test/checker-fixture.hpp @@ -0,0 +1,127 @@ +/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */ + +#pragma once + +#include "base/atomic.hpp" +#include "checker/checkercomponent.hpp" +#include "icinga/checkcommand.hpp" +#include "remote/endpoint.hpp" +#include "test/base-testloggerfixture.hpp" + +namespace icinga { + +/** + * Test fixture for tests involving the @c CheckerComponent. + * + * This fixture sets up a CheckerComponent instance and provides utility functions to register + * checkable objects. It is derived from @c TestLoggerFixture to capture log output during tests, + * so tests can verify that expected log messages are produced. The fixture also connects to the + * @c Checkable::OnNewCheckResult signal to count the number of check results produced during tests. + */ +struct CheckerFixture : TestLoggerFixture +{ + CheckerFixture(); + + ~CheckerFixture(); + + /** + * Registers a fully configured set of checkable hosts that execute the "random" check command. + * + * Each host is configured with a random check command, check interval, and retry interval. + * If @c unreachable is true, each host is made unreachable by adding a dependency on a parent + * host that is in a critical state. This prevents the checker from executing checks for the + * child hosts. The check and retry intervals are kept low to allow for quick test execution, + * but they can be adjusted via the @c interval and @c retry parameters. + * + * @param count Number of checkable hosts to register. + * @param disableChecks If true, disables active checks for each host. + * @param unreachable If true, makes each host unreachable via a dependency. + */ + void RegisterCheckablesRandom(int count, bool disableChecks = false, bool unreachable = false); + + /** + * Registers a fully configured set of checkable hosts that execute the "sleep" command. + * + * Each host is configured with a sleep check command that sleeps for the specified duration. + * The check and retry intervals can be adjusted via the @c checkInterval and @c retryInterval + * member variables of the fixture. If @c unreachable is true, each host is made unreachable by + * adding a dependency on a parent host that is in a critical state. This prevents the checker + * from executing checks for the child hosts. + * + * @param count Number of checkable hosts to register. + * @param sleepTime Duration (in seconds) that the sleep command should sleep. Defaults to 1.0 second. + * @param disableChecks If true, disables active checks for each host. + * @param unreachable If true, makes each host unreachable via a dependency. + */ + void RegisterCheckablesSleep( + int count, + double sleepTime = 1.0, + bool disableChecks = false, + bool unreachable = false + ); + + /** + * Registers a remote endpoint and a set of checkable hosts assigned to that endpoint. + * + * The remote endpoint can be configured to appear connected or disconnected, and can also + * be set to be syncing replay as needed for tests involving remote checks. + * + * @param count Number of checkable hosts to register. + * @param isConnected If true, the remote endpoint is marked as connected. + * @param isSyncingReplayLogs If true, the remote endpoint is marked as syncing replay logs. + */ + void RegisterRemoteChecks(int count, bool isConnected = false, bool isSyncingReplayLogs = false); + + Host::Ptr RegisterCheckable( + std::string name, + std::string cmd, + std::string period = "", + std::string endpoint = "", + bool disableChecks = false, + bool unreachable = false + ); + + /** + * Sleeps for the specified number of seconds, then immediately disconnects all signal handlers + * connected to @c Checkable::OnNextCheckChanged and @c Checkable::OnNewCheckResult. + * + * This is useful in tests to allow some time for checks to be executed and results to be processed, + * while ensuring that no further signal handlers are called after the sleep period. This helps to avoid + * unexpected side effects in tests, since the checker continues to run till the fixture is destroyed. + * + * @param seconds Number of seconds to sleep. + * @param deactivateLogger If true, deactivates the test logger after sleeping to prevent further log capture. + */ + void SleepFor(double seconds, bool deactivateLogger = false) const; + + /** + * Registers a remote endpoint with the specified name and connection/syncing state and returns it. + * + * @param name Name of the endpoint to register. + * @param isConnected If true, the endpoint is marked as connected. + * @param isSyncingReplayLogs If true, the endpoint is marked as syncing replay logs. + * + * @return The registered endpoint instance. + */ + static Endpoint::Ptr RegisterEndpoint(std::string name, bool isConnected = false, bool isSyncingReplayLogs = false); + + /** + * resultCount tracks the number of check results produced by the checker. + * + * This is used in tests to verify that checks are actually being executed and results processed. + * It is incremented from within the OnNewCheckResult signal handler, thus must be atomic. + */ + Atomic resultCount{0}; + AtomicOrLocked lastResult; // Stores the last check result received, for inspection in tests. + /** + * nextCheckTimes tracks the next scheduled check time for each registered checkable host. + * This might not be used in all tests, but is available for tests that need to verify the exact + * next check timestamp set by the checker. + */ + std::map nextCheckTimes; + double checkInterval{.1}; // Interval in seconds between regular checks for each checkable. + double retryInterval{.1}; // Interval in seconds between retry checks for each checkable. + CheckerComponent::Ptr checker; +}; + +} // namespace icinga diff --git a/test/checker.cpp b/test/checker.cpp new file mode 100644 index 0000000000..e0e0a0585c --- /dev/null +++ b/test/checker.cpp @@ -0,0 +1,311 @@ +/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */ + +#include "test/checker-fixture.hpp" +#include "base/scriptglobal.hpp" +#include "icinga/host.hpp" +#include "icinga/dependency.hpp" +#include "icinga/legacytimeperiod.hpp" +#include + +using namespace icinga; + +BOOST_FIXTURE_TEST_SUITE(checker, CheckerFixture, *boost::unit_test::label("checker")) + +BOOST_AUTO_TEST_CASE(single_check) +{ + // For a single checkable, there shouldn't be any concurrent checks that trigger this event, + // so we can safely use boost assertion macros within the event handler. + Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { + BOOST_CHECK_EQUAL(checkable->GetName(), "host-1"); + BOOST_CHECK_LE(checkable->GetNextCheck(), checkable->GetLastCheck() + checkInterval + .5); + }); + + RegisterCheckablesRandom(1); + SleepFor(.4, true); + + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Executing check for 'host-1'")); + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Check finished for object 'host-1'")); + BOOST_CHECK_EQUAL(4, resultCount); +} + +BOOST_AUTO_TEST_CASE(multiple_checks) +{ + Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { + BOOST_CHECK_LE(checkable->GetNextCheck(), checkable->GetLastCheck() + checkInterval + .5); + }); + + RegisterCheckablesRandom(8); + SleepFor(.3, true); + + BOOST_CHECK(ExpectLogPattern("Executing check for .*")); + BOOST_CHECK(ExpectLogPattern("Check finished for object .*")); + auto executedC = testLogger->CountExpectedLogPattern("Executing check for .*"); + auto finishedC = testLogger->CountExpectedLogPattern("Check finished for object .*"); + BOOST_CHECK_EQUAL(executedC, finishedC); + // With 8 checkables and a check interval of 0.1s, we expect that each checkable is checked at least + // twice, but some of them possibly up to 3 times, depending on the timing and OS scheduling behavior. + BOOST_CHECK_MESSAGE(22 <= resultCount && resultCount <= 25, "got=" << resultCount); +} + +BOOST_AUTO_TEST_CASE(disabled_checks) +{ + RegisterCheckablesRandom(4, true); + SleepFor(.3, true); + + auto failedC = testLogger->CountExpectedLogPattern("Skipping check for host .*: active host checks are disabled"); + BOOST_CHECK_MESSAGE(10 <= failedC && failedC <= 13, "got=" << failedC); + + auto rescheduleC = testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check."); + BOOST_CHECK_MESSAGE(10 <= rescheduleC && rescheduleC <= 13, "got=" << rescheduleC); + BOOST_CHECK_EQUAL(failedC, rescheduleC); + + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(globally_disabled_checks) +{ + IcingaApplication::GetInstance()->SetEnableHostChecks(false); // Disable active host checks globally + + RegisterCheckablesRandom(4); + SleepFor(.3, true); + + auto failedC = testLogger->CountExpectedLogPattern("Skipping check for host .*: active host checks are disabled"); + BOOST_CHECK_MESSAGE(10 <= failedC && failedC <= 13, "got=" << failedC); + + auto rescheduleC = testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check."); + BOOST_CHECK_MESSAGE(10 <= rescheduleC && rescheduleC <= 13, "got=" << rescheduleC); + BOOST_CHECK_EQUAL(failedC, rescheduleC); + + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(unreachable_checkable) +{ + // Create a dependency that makes the host unreachable (i.e. no checks should be executed). + // This must be done before activating the actual child checkable, otherwise the checker will + // immediately schedule a check before the dependency is in place. + RegisterCheckablesRandom(4, false, true); + SleepFor(.3, true); + + auto failedC = testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed."); + BOOST_CHECK_MESSAGE(10 <= failedC && failedC <= 13, "got=" << failedC); + + auto rescheduleC = testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check."); + BOOST_CHECK_MESSAGE(10 <= rescheduleC && rescheduleC <= 13, "got=" << rescheduleC); + BOOST_CHECK_EQUAL(failedC, rescheduleC); + + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(never_in_check_period) +{ + TimePeriod::Ptr period = new TimePeriod; + period->SetName("never"); + period->SetUpdate(new Function("LegacyTimePeriod", LegacyTimePeriod::ScriptFunc, {"tp", "begin", "end"}), true); + period->Register(); + period->PreActivate(); + period->Activate(); + + // Register some checkables that are only checked during the "never" time period, which is never. + (void)RegisterCheckable("host-1", "random", "never"); + (void)RegisterCheckable("host-2", "random", "never"); + (void)RegisterCheckable("host-3", "sleep", "never"); + (void)RegisterCheckable("host-4", "sleep", "never"); + + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + BOOST_TEST_MESSAGE("Host " << std::quoted(host) << " -> next_check: " << std::fixed << std::setprecision(0) + << nextCheck << " expected: " << Convert::ToDouble(period->GetValidEnd())); + // The checker should ignore the regular check interval and instead set the next check time based on the tp. + BOOST_CHECK_EQUAL(nextCheck, Convert::ToDouble(period->GetValidEnd())); + } + + // We expect that no checks are executed, and instead the checker reschedules the checks for the + // next valid end time of the "never" time period, which is always 24h from now. So, we should see + // 4 log messages about skipping the checks due to the time period, and nothing else. + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Skipping check for object .*, as not in check period 'never', until .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(in_check_period) +{ + TimePeriod::Ptr period = new TimePeriod; + period->SetName("24x7"); + period->SetRanges( + new Dictionary{ + {"monday", "00:00-24:00"}, + {"tuesday", "00:00-24:00"}, + {"wednesday", "00:00-24:00"}, + {"thursday", "00:00-24:00"}, + {"friday", "00:00-24:00"}, + {"saturday", "00:00-24:00"}, + {"sunday", "00:00-24:00"} + }, + true + ); + period->SetUpdate(new Function("LegacyTimePeriod", LegacyTimePeriod::ScriptFunc, {"tp", "begin", "end"}), true); + period->Register(); + period->PreActivate(); + period->Activate(); + + // Register some checkables that are only checked during the "always" time period, which is always. + (void)RegisterCheckable("host-1", "random", "always"); + (void)RegisterCheckable("host-2", "random", "always"); + (void)RegisterCheckable("host-3", "sleep", "always"); + (void)RegisterCheckable("host-4", "sleep", "always"); + + SleepFor(.3, true); + + // We expect that checks are executed normally, and the checker sets the next check time based + // on the regular check interval. So, we should see multiple checks executed for each checkable. + BOOST_CHECK(ExpectLogPattern("Executing check for .*")); + BOOST_CHECK(ExpectLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_MESSAGE(6 <= resultCount && resultCount <= 8, "got=" << resultCount); +} + +BOOST_AUTO_TEST_CASE(max_concurrent_checks) +{ + // Limit the number of concurrent checks to 4. + ScriptGlobal::Set("MaxConcurrentChecks", 4); + + // Register 16 checkables that each sleep for 10 seconds when executing their check. + // With a max concurrent check limit of 4, we should see that only 4 checks are executed + // at the same time, and the remaining 12 checks are queued until one of the running checks + // finishes (which will not happen within the short sleep time of this test). + RegisterCheckablesSleep(16, 10); + Utility::Sleep(.5); + + auto objects(ConfigType::GetObjectsByType()); + BOOST_CHECK_EQUAL(16, objects.size()); + + for (auto& h : objects) { + // Force a reschedule of the checks to see whether the checker does absolutely nothing + // when the max concurrent check limit is reached. Normally, this would force the checker + // to immediately pick up the checkable and execute its check, but since all 4 slots are + // already taken, the checker should just update its queue idx and do nothing else. + Checkable::OnRescheduleCheck(h, Utility::GetTime()); + } + Utility::Sleep(.5); + + // We expect that only 4 checks are started initially, and the other 12 checks should have + // never been run, since the sleep time for each check (10 seconds) is much longer than the + // total sleep time of this test (1 second). + BOOST_CHECK(ExpectLogPattern("Pending checkables: 4; Idle checkables: 12; Checks/s: .*")); + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Scheduling info for checkable .*: Object .*")); + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(4, Checkable::GetPendingChecks()); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); // none finished yet + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(skipped_remote_checks) +{ + // The check execution for remote checks is skipped if the remote endpoint is not connected, + // not syncing, and we are within the cold startup window (5min after application start). + Application::GetInstance()->SetStartTime(Utility::GetTime()); + + // Set the check and retry intervals to 60 seconds, since it's sufficient to + // just verify that the checks are skipped only once, and not repeatedly. + checkInterval = 60; + retryInterval = 60; + + RegisterRemoteChecks(8); + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + BOOST_TEST_MESSAGE("Host " << std::quoted(host) << " -> next_check: " << std::fixed << std::setprecision(0) + << nextCheck << " roughly expected: " << Utility::GetTime() + checkInterval); + // Our algorithm for computing the next check time is not too precise, but it should roughly be within 5s of + // the expected next check time based on the check interval. See Checkable::UpdateNextCheck() for details. + BOOST_CHECK_GE(nextCheck, Utility::GetTime() + checkInterval-5); + BOOST_CHECK_LE(nextCheck, Utility::GetTime() + checkInterval); // but not more than the interval + } + + BOOST_CHECK_EQUAL(nullptr, lastResult.load()); // No check results should be received + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(remote_checks_outside_cold_startup) +{ + Application::GetInstance()->SetStartTime(Utility::GetTime()-500); // Simulate being outside cold startup window + + checkInterval = 60; + retryInterval = 60; + + RegisterRemoteChecks(8); + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + BOOST_TEST_MESSAGE("Host " << std::quoted(host) << " -> next_check: " << std::fixed << std::setprecision(0) + << nextCheck << " roughly expected: " << Utility::GetTime() + checkInterval); + // Our algorithm for computing the next check time is not too precise, but it should roughly be within 5s of + // the expected next check time based on the check interval. See Checkable::UpdateNextCheck() for details. + BOOST_CHECK_GE(nextCheck, Utility::GetTime() + checkInterval-5); + BOOST_CHECK_LE(nextCheck, Utility::GetTime() + checkInterval); // but not more than the interval + } + + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(8, resultCount); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + + BOOST_REQUIRE(lastResult.load()); // We should now have a cr! + + auto cr = lastResult.load(); + BOOST_CHECK_EQUAL(ServiceUnknown, cr->GetState()); + String expectedOutput = "Remote Icinga instance 'remote-checker' is not connected to '" + Endpoint::GetLocalEndpoint()->GetName() + "'"; + BOOST_CHECK_EQUAL(expectedOutput, cr->GetOutput()); +} + +BOOST_AUTO_TEST_CASE(remote_checks_with_connected_endpoint) +{ + // Register a remote endpoint that is connected, so remote checks can be executed or actually simulate + // sending the check command to the remote endpoint. In this case, we shouldn't also receive any cr, + // but checker should reschedule the check at "now + check_timeout (which is 0) + 30s". + RegisterRemoteChecks(8, true); + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + // In this case, there shouldn't be any OnNextCheckChanged events triggered, + // thus nextCheck should still be 0.0 as initialized by RegisterCheckable(). + BOOST_CHECK_EQUAL(0.0, nextCheck); + } + + auto hosts(ConfigType::GetObjectsByType()); + BOOST_CHECK_EQUAL(8, hosts.size()); + + for (auto& h : hosts) { + // Verify the next_check time is set to roughly now + 30s, since RegisterCheckable() + // initializes the check_timeout to 0. + BOOST_TEST_MESSAGE("Host " << std::quoted(h->GetName().GetData()) << " -> next_check: " << std::fixed + << std::setprecision(0) << h->GetNextCheck() << " roughly expected: " << Utility::GetTime() + 30); + BOOST_CHECK_GE(h->GetNextCheck(), Utility::GetTime() + 25); + BOOST_CHECK_LE(h->GetNextCheck(), Utility::GetTime() + 30); + } + + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Sending message 'event::ExecuteCommand' to 'remote-checker'")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_EQUAL(0, resultCount); // No check results should be received yet +} + +BOOST_AUTO_TEST_SUITE_END()