diff --git a/lib/checker/checkercomponent.cpp b/lib/checker/checkercomponent.cpp index 04583833a0..d400b2b10c 100644 --- a/lib/checker/checkercomponent.cpp +++ b/lib/checker/checkercomponent.cpp @@ -55,6 +55,9 @@ void CheckerComponent::OnConfigLoaded() Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { NextCheckChangedHandler(checkable); }); + Checkable::OnRescheduleCheck.connect([this](const Checkable::Ptr& checkable, double nextCheck) { + NextCheckChangedHandler(checkable, nextCheck); + }); } void CheckerComponent::Start(bool runtimeCreated) @@ -68,7 +71,7 @@ void CheckerComponent::Start(bool runtimeCreated) m_Thread = std::thread([this]() { CheckThreadProc(); }); m_ResultTimer = Timer::Create(); - m_ResultTimer->SetInterval(5); + m_ResultTimer->SetInterval(m_ResultTimerInterval); m_ResultTimer->OnTimerExpired.connect([this](const Timer * const&) { ResultTimerHandler(); }); m_ResultTimer->Start(); } @@ -135,7 +138,6 @@ void CheckerComponent::CheckThreadProc() bool forced = checkable->GetForceNextCheck(); bool check = true; - bool notifyNextCheck = false; double nextCheck = -1; if (!forced) { @@ -144,7 +146,6 @@ void CheckerComponent::CheckThreadProc() << "Skipping check for object '" << checkable->GetName() << "': Dependency failed."; check = false; - notifyNextCheck = true; } Host::Ptr host; @@ -181,7 +182,6 @@ void CheckerComponent::CheckThreadProc() << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck); check = false; - notifyNextCheck = true; } } } @@ -200,11 +200,6 @@ void CheckerComponent::CheckThreadProc() checkable->UpdateNextCheck(); } - if (notifyNextCheck) { - // Trigger update event for Icinga DB - Checkable::OnNextCheckUpdated(checkable); - } - lock.lock(); continue; @@ -341,7 +336,7 @@ CheckableScheduleInfo CheckerComponent::GetCheckableScheduleInfo(const Checkable return csi; } -void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable) +void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck) { std::unique_lock lock(m_Mutex); @@ -356,7 +351,13 @@ void CheckerComponent::NextCheckChangedHandler(const Checkable::Ptr& checkable) idx.erase(checkable); - CheckableScheduleInfo csi = GetCheckableScheduleInfo(checkable); + CheckableScheduleInfo csi; + if (nextCheck < 0) { + csi = GetCheckableScheduleInfo(checkable); + } else { + csi.NextCheck = nextCheck; + csi.Object = checkable; + } idx.insert(csi); m_CV.notify_all(); @@ -375,3 +376,18 @@ unsigned long CheckerComponent::GetPendingCheckables() return m_PendingCheckables.size(); } + +/** + * Sets the interval in seconds for the result timer. + * + * The result timer periodically logs the number of pending and idle checkables + * as well as the checks per second rate. The default interval is 5 seconds. + * + * Note, this method must be called before the component is started to have an effect on the timer. + * + * @param interval Interval in seconds for the result timer. + */ +void CheckerComponent::SetResultTimerInterval(double interval) +{ + m_ResultTimerInterval = interval; +} diff --git a/lib/checker/checkercomponent.hpp b/lib/checker/checkercomponent.hpp index edd3775e58..dfd34f540e 100644 --- a/lib/checker/checkercomponent.hpp +++ b/lib/checker/checkercomponent.hpp @@ -26,21 +26,25 @@ struct CheckableScheduleInfo { Checkable::Ptr Object; double NextCheck; -}; - -/** - * @ingroup checker - */ -struct CheckableNextCheckExtractor -{ - typedef double result_type; /** - * @threadsafety Always. + * Get the index value for ordering in the multi-index container. + * + * This function returns a very large value for checkables that have a running check, effectively pushing + * them to the end of the ordering. This ensures that checkables with running checks are not prioritized + * for scheduling ahead of others. Rescheduling of such checkables is unnecessary because the checkable + * is going to reject this anyway if it notices that a check is already running, so avoiding unnecessary + * CPU load. Once the running check is finished, the checkable will be re-inserted into the set with its + * actual next check time as the index value. + * + * @return The index value for ordering in the multi-index container. */ - double operator()(const CheckableScheduleInfo& csi) + double Index() const { - return csi.NextCheck; + if (Object->HasRunningCheck()) { + return std::numeric_limits::max(); + } + return NextCheck; } }; @@ -57,7 +61,7 @@ class CheckerComponent final : public ObjectImpl CheckableScheduleInfo, boost::multi_index::indexed_by< boost::multi_index::ordered_unique >, - boost::multi_index::ordered_non_unique + boost::multi_index::ordered_non_unique> > > CheckableSet; @@ -69,12 +73,16 @@ class CheckerComponent final : public ObjectImpl unsigned long GetIdleCheckables(); unsigned long GetPendingCheckables(); + void SetResultTimerInterval(double interval); + private: std::mutex m_Mutex; std::condition_variable m_CV; bool m_Stopped{false}; std::thread m_Thread; + double m_ResultTimerInterval{5.0}; // Interval in seconds for the result timer. + CheckableSet m_IdleCheckables; CheckableSet m_PendingCheckables; @@ -86,12 +94,8 @@ class CheckerComponent final : public ObjectImpl void ExecuteCheckHelper(const Checkable::Ptr& checkable); - void AdjustCheckTimer(); - void ObjectHandler(const ConfigObject::Ptr& object); - void NextCheckChangedHandler(const Checkable::Ptr& checkable); - - void RescheduleCheckTimer(); + void NextCheckChangedHandler(const Checkable::Ptr& checkable, double nextCheck = -1); static CheckableScheduleInfo GetCheckableScheduleInfo(const Checkable::Ptr& checkable); }; diff --git a/lib/db_ido/dbevents.cpp b/lib/db_ido/dbevents.cpp index cf80a0226d..11a83d2c12 100644 --- a/lib/db_ido/dbevents.cpp +++ b/lib/db_ido/dbevents.cpp @@ -40,7 +40,7 @@ void DbEvents::StaticInitialize() DbEvents::RemoveAcknowledgement(checkable); }); - Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable) { NextCheckUpdatedHandler(checkable); }); + Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const Value&) { NextCheckUpdatedHandler(checkable); }); Checkable::OnFlappingChanged.connect([](const Checkable::Ptr& checkable, const Value&) { FlappingChangedHandler(checkable); }); Checkable::OnNotificationSentToAllUsers.connect([](const Notification::Ptr& notification, const Checkable::Ptr& checkable, const std::set&, const NotificationType&, const CheckResult::Ptr&, const String&, const String&, diff --git a/lib/icinga/apiactions.cpp b/lib/icinga/apiactions.cpp index e7c2e35109..3586bf2ecd 100644 --- a/lib/icinga/apiactions.cpp +++ b/lib/icinga/apiactions.cpp @@ -159,9 +159,6 @@ Dictionary::Ptr ApiActions::RescheduleCheck(const ConfigObject::Ptr& object, checkable->SetNextCheck(nextCheck); - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(checkable); - return ApiActions::CreateResult(200, "Successfully rescheduled check for object '" + checkable->GetName() + "'."); } diff --git a/lib/icinga/checkable-check.cpp b/lib/icinga/checkable-check.cpp index fe7b651bed..963dde38dc 100644 --- a/lib/icinga/checkable-check.cpp +++ b/lib/icinga/checkable-check.cpp @@ -22,7 +22,7 @@ boost::signals2::signal Checkable::OnStateChange; boost::signals2::signal, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged; boost::signals2::signal Checkable::OnNotificationsRequested; -boost::signals2::signal Checkable::OnNextCheckUpdated; +boost::signals2::signal Checkable::OnRescheduleCheck; Atomic Checkable::CurrentConcurrentChecks (0); @@ -45,12 +45,22 @@ void Checkable::SetSchedulingOffset(long offset) m_SchedulingOffset = offset; } -long Checkable::GetSchedulingOffset() +long Checkable::GetSchedulingOffset() const { return m_SchedulingOffset; } -void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) +/** + * Update the next check time of this checkable based on its check interval and last check time. + * + * If onlyReschedule is true, the next check time is not actually updated, but the @c Checkable::OnRescheduleCheck + * signal is emitted with the new calculated next check time. Otherwise, the next check time is updated + * and the @c Checkable::OnNextCheckChanged signal is emitted accordingly. + * + * @param origin The origin of the message triggering this update, can be nullptr. + * @param onlyReschedule If true, only emit @c OnRescheduleCheck without updating the next check time. + */ +void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin, bool onlyReschedule) { double interval; @@ -78,7 +88,14 @@ void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin) << " (" << lastCheck << ") to next check time at " << Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ")."; - SetNextCheck(nextCheck, false, origin); + if (onlyReschedule) { + // Someone requested to only reschedule the next check without actually changing it. + // So, just tell the checker about this new timestamp and return. + OnRescheduleCheck(this, nextCheck); + } else { + // Otherwise, set the next check to the newly calculated timestamp and inform all its listeners. + SetNextCheck(nextCheck, false, origin); + } } bool Checkable::HasBeenChecked() const @@ -86,6 +103,11 @@ bool Checkable::HasBeenChecked() const return GetLastCheckResult() != nullptr; } +bool Checkable::HasRunningCheck() const +{ + return m_CheckRunning; +} + double Checkable::GetLastCheck() const { CheckResult::Ptr cr = GetLastCheckResult(); @@ -105,7 +127,7 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr VERIFY(producer); ObjectLock olock(this); - m_CheckRunning = false; + m_CheckRunning.store(false); double now = Utility::GetTime(); @@ -513,12 +535,15 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr if (recovery) { for (auto& child : children) { if (child->GetProblem() && child->GetEnableActiveChecks()) { - auto nextCheck (now + Utility::Random() % 60); - - ObjectLock oLock (child); - - if (nextCheck < child->GetNextCheck()) { - child->SetNextCheck(nextCheck); + if (auto nextCheck (now + Utility::Random() % 60); nextCheck < child->GetNextCheck()) { + /** + * We only want to enforce the checker to pick this up sooner, and no need to actually change + * the timesatmp. Plus, no other listeners should be informed about this other than the checker, + * so we emit the OnRescheduleCheck signal directly. In case our checker isn't responsible for + * this child object, we've already broadcasted the `CheckResult` event which will cause on the + * responsible node to enter this exact branch and do the rescheduling for its own checker. + */ + OnRescheduleCheck(child, nextCheck); } } } @@ -534,8 +559,8 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr continue; if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) { - ObjectLock olock(parent); - parent->SetNextCheck(now); + // See comment above for children. We want to just enforce an immediate check by our checker. + OnRescheduleCheck(parent, now); } } } @@ -561,29 +586,21 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) { CONTEXT("Executing check for object '" << GetName() << "'"); + /* don't run another check if there is one pending */ + if (m_CheckRunning.exchange(true)) + return; // Should never happen as the checker already takes care of this. + /* keep track of scheduling info in case the check type doesn't provide its own information */ double scheduled_start = GetNextCheck(); double before_check = Utility::GetTime(); SetLastCheckStarted(Utility::GetTime()); - /* This calls SetNextCheck() which updates the CheckerComponent's idle/pending - * queues and ensures that checks are not fired multiple times. ProcessCheckResult() - * is called too late. See #6421. - */ - UpdateNextCheck(); - bool reachable = IsReachable(); { ObjectLock olock(this); - /* don't run another check if there is one pending */ - if (m_CheckRunning) - return; - - m_CheckRunning = true; - SetLastStateRaw(GetStateRaw()); SetLastStateType(GetLastStateType()); SetLastReachable(reachable); @@ -640,11 +657,16 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) if (listener) listener->SyncSendMessage(endpoint, message); - /* Re-schedule the check so we don't run it again until after we've received - * a check result from the remote instance. The check will be re-scheduled - * using the proper check interval once we've received a check result. + /* + * Let the checker use a dummy next check time until we actually receive the check result from the + * remote endpoint. This should be sufficiently far in the future to avoid excessive CPU load by + * constantly re-running the check, but not too far in the future to avoid that the check is not + * re-run for too long in case the remote endpoint never responds. We add a small grace period + * to the check command timeout to account for network latency and processing time on the remote + * endpoint. So, we only need to silently update this without notifying any listeners, and once + * this function returns, the checker is going access it via GetNextCheck() again. */ - SetNextCheck(Utility::GetTime() + checkTimeout + 30); + SetNextCheck(Utility::GetTime() + checkTimeout + 30, true); /* * Let the user know that there was a problem with the check if @@ -667,12 +689,22 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer) cr->SetOutput(output); ProcessCheckResult(cr, producer); + } else { + /** + * The endpoint is currently either syncing its state or not connected yet and we are within + * the magical 5min cold startup window. In both cases, we just don't do anything and wait for + * the next check interval to re-try the check again. So, this check is effectively skipped. + */ + UpdateNextCheck(); } - { - ObjectLock olock(this); - m_CheckRunning = false; - } + /** + * If this is a remote check, we don't know when the check result will be received and processed. + * Therefore, we must mark the check as no longer running here, otherwise, no further checks + * would be executed for this checkable as it would always appear as having a running check + * (see the check at the start of this function). + */ + m_CheckRunning.store(false); } } diff --git a/lib/icinga/checkable.cpp b/lib/icinga/checkable.cpp index dbe73fe555..3a1262744f 100644 --- a/lib/icinga/checkable.cpp +++ b/lib/icinga/checkable.cpp @@ -88,14 +88,26 @@ void Checkable::Start(bool runtimeCreated) auto cr (GetLastCheckResult()); if (GetLastCheckStarted() > (cr ? cr->GetExecutionEnd() : 0.0)) { - SetNextCheck(GetLastCheckStarted()); + /** + * Each node performs a check immediately after startup before the object authority is determined. + * Meaning, for runtime created objects we should not enter this branch and for non-runtime created + * objects we're going to call `ApiListener::UpdateObjectAuthority()` after all objects have been + * started, so we can safely assume that this timestamp is only relevant for our own checker. + * + * However, until the authority is determined, by default all objects are paused and thus triggering + * `OnRescheduleCheck` would have no effect, as the checker component ignores paused objects. Therefore, + * our only option is to set this dummy ts silently here, so that when the object is unpaused later, + * the checker component will pick it up and schedule the next check properly. + */ + SetNextCheck(GetLastCheckStarted(), true); } } if (GetNextCheck() < now + 60) { double delta = std::min(GetCheckInterval(), 60.0); delta *= (double)std::rand() / RAND_MAX; - SetNextCheck(now + delta); + // We only want to jitter the next check a bit, and inform the scheduler about it, so not setting it directly. + SetNextCheck(now + delta, true); } ObjectImpl::Start(runtimeCreated); diff --git a/lib/icinga/checkable.hpp b/lib/icinga/checkable.hpp index 796421a9b6..b4ae94f1ee 100644 --- a/lib/icinga/checkable.hpp +++ b/lib/icinga/checkable.hpp @@ -101,14 +101,15 @@ class Checkable : public ObjectImpl intrusive_ptr GetCheckCommand() const; TimePeriod::Ptr GetCheckPeriod() const; - long GetSchedulingOffset(); + long GetSchedulingOffset() const; void SetSchedulingOffset(long offset); - void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr); + void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr, bool onlyReschedule = false); static String StateTypeToString(StateType type); bool HasBeenChecked() const; + bool HasRunningCheck() const; virtual bool IsStateOK(ServiceState state) const = 0; double GetLastCheck() const final; @@ -146,7 +147,18 @@ class Checkable : public ObjectImpl bool, bool, double, double, const MessageOrigin::Ptr&)> OnAcknowledgementSet; static boost::signals2::signal OnAcknowledgementCleared; static boost::signals2::signal OnFlappingChange; - static boost::signals2::signal OnNextCheckUpdated; + /** + * Think again! Are you really sure you want to subscribe to this signal? + * + * This signal is a very special and noisy one. It is emitted whenever someone wants to enforce the + * LOCAL scheduler to reschedule the next check of a checkable at the given timestamp. This can be + * due to a number of reasons, for example: Icinga 2 was interrupted while a check was being executed, + * and the checkable needs to be rescheduled on startup; or a parent host changed its state and all its + * children need to be rescheduled to reflect the new reachability state. There are other cases as well, + * but the point is: this signal is meant to only be used by the local @c CheckerComponent to update its + * internal queues. If you want to use this for something else, think twice!! + */ + static boost::signals2::signal OnRescheduleCheck; static boost::signals2::signal OnEventCommandExecuted; static Atomic CurrentConcurrentChecks; @@ -223,7 +235,7 @@ class Checkable : public ObjectImpl private: mutable std::mutex m_CheckableMutex; - bool m_CheckRunning{false}; + Atomic m_CheckRunning{false}; long m_SchedulingOffset; static std::mutex m_StatsMutex; diff --git a/lib/icinga/clusterevents.cpp b/lib/icinga/clusterevents.cpp index d76a9351aa..e0ef3c00c8 100644 --- a/lib/icinga/clusterevents.cpp +++ b/lib/icinga/clusterevents.cpp @@ -46,7 +46,9 @@ REGISTER_APIFUNCTION(SetRemovalInfo, event, &ClusterEvents::SetRemovalInfoAPIHan void ClusterEvents::StaticInitialize() { Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler); - Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler); + Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin) { + NextCheckChangedHandler(checkable, origin); + }); Checkable::OnLastCheckStartedChanged.connect(&ClusterEvents::LastCheckStartedChangedHandler); Checkable::OnStateBeforeSuppressionChanged.connect(&ClusterEvents::StateBeforeSuppressionChangedHandler); Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler); @@ -183,7 +185,7 @@ Value ClusterEvents::CheckResultAPIHandler(const MessageOrigin::Ptr& origin, con return Empty; } -void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin) +void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin, bool isFromOldClient) { ApiListener::Ptr listener = ApiListener::GetInstance(); @@ -199,6 +201,7 @@ void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, con if (service) params->Set("service", service->GetShortName()); params->Set("next_check", checkable->GetNextCheck()); + params->Set("origin_client_old", isFromOldClient); Dictionary::Ptr message = new Dictionary(); message->Set("jsonrpc", "2.0"); @@ -245,7 +248,25 @@ Value ClusterEvents::NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin if (nextCheck < Application::GetStartTime() + 60) return Empty; - checkable->SetNextCheck(params->Get("next_check"), false, origin); + bool isOriginClientOld = params->Get("origin_client_old").ToBool(); + if (isOriginClientOld || endpoint->GetIcingaVersion() < 21600) { // TODO: Exact required version?? + // All older Icinga 2 versions trigger the `event::SetNextCheck` event endlessly, so we can't let the + // `Checkable::OnNextCheckChanged` know about this change, as it would cause too much noise in Icinga DB + // and the like. So, we just update the timestamp silently and manually call `Checkable::OnRescheduleCheck` + // to inform the checker about this change. + checkable->SetNextCheck(params->Get("next_check"), true, origin); + Checkable::OnRescheduleCheck(checkable, nextCheck); + // In case this message has to be relayed to other cluster nodes we directly call the `NextCheckChangedHandler`. + // The `isOriginClientOld` flag is necessary e.g. in a 3-level cluster setup where a very old agent sends + // the message to its satellite that then relays it to the master. The master then needs to know that this + // event originates from an old node and thus has to be treated specially. + NextCheckChangedHandler(checkable, origin, true); + } else { + // Peer is running the expected Icinga 2 version, so we can just do the normal flow. + // The `Checkable::OnNextCheckChanged` signal will be emitted as usual and all listeners + // will be informed about this change. + checkable->SetNextCheck(params->Get("next_check"), false, origin); + } return Empty; } diff --git a/lib/icinga/clusterevents.hpp b/lib/icinga/clusterevents.hpp index 8daf86ab3b..dd01b3a552 100644 --- a/lib/icinga/clusterevents.hpp +++ b/lib/icinga/clusterevents.hpp @@ -23,7 +23,7 @@ class ClusterEvents static void CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin); static Value CheckResultAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); - static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin); + static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin, bool isFromOldClient = false); static Value NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params); static void LastCheckStartedChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin); diff --git a/lib/icinga/externalcommandprocessor.cpp b/lib/icinga/externalcommandprocessor.cpp index 33831a4910..b61f640fd2 100644 --- a/lib/icinga/externalcommandprocessor.cpp +++ b/lib/icinga/externalcommandprocessor.cpp @@ -392,9 +392,6 @@ void ExternalCommandProcessor::ScheduleHostCheck(double, const std::vectorSetNextCheck(planned_check); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(host); } void ExternalCommandProcessor::ScheduleForcedHostCheck(double, const std::vector& arguments) @@ -409,9 +406,6 @@ void ExternalCommandProcessor::ScheduleForcedHostCheck(double, const std::vector host->SetForceNextCheck(true); host->SetNextCheck(Convert::ToDouble(arguments[1])); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(host); } void ExternalCommandProcessor::ScheduleSvcCheck(double, const std::vector& arguments) @@ -437,9 +431,6 @@ void ExternalCommandProcessor::ScheduleSvcCheck(double, const std::vectorSetNextCheck(planned_check); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } void ExternalCommandProcessor::ScheduleForcedSvcCheck(double, const std::vector& arguments) @@ -454,9 +445,6 @@ void ExternalCommandProcessor::ScheduleForcedSvcCheck(double, const std::vector< service->SetForceNextCheck(true); service->SetNextCheck(Convert::ToDouble(arguments[2])); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } void ExternalCommandProcessor::EnableHostCheck(double, const std::vector& arguments) @@ -538,9 +526,6 @@ void ExternalCommandProcessor::ScheduleForcedHostSvcChecks(double, const std::ve service->SetNextCheck(planned_check); service->SetForceNextCheck(true); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } } @@ -568,9 +553,6 @@ void ExternalCommandProcessor::ScheduleHostSvcChecks(double, const std::vectorGetName() << "'"; service->SetNextCheck(planned_check); - - /* trigger update event for DB IDO */ - Checkable::OnNextCheckUpdated(service); } } diff --git a/lib/icingadb/icingadb-objects.cpp b/lib/icingadb/icingadb-objects.cpp index a8c2ed0b2c..390d30063f 100644 --- a/lib/icingadb/icingadb-objects.cpp +++ b/lib/icingadb/icingadb-objects.cpp @@ -135,8 +135,8 @@ void IcingaDB::ConfigStaticInitialize() IcingaDB::NewCheckResultHandler(checkable); }); - Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable) { - IcingaDB::NextCheckUpdatedHandler(checkable); + Checkable::OnNextCheckChanged.connect([](const Checkable::Ptr& checkable, const Value&) { + IcingaDB::NextCheckChangedHandler(checkable); }); Service::OnHostProblemChanged.connect([](const Service::Ptr& service, const CheckResult::Ptr&, const MessageOrigin::Ptr&) { @@ -3206,11 +3206,10 @@ void IcingaDB::NewCheckResultHandler(const Checkable::Ptr& checkable) { for (auto& rw : ConfigType::GetObjectsByType()) { rw->UpdateState(checkable, StateUpdate::Volatile); - rw->SendNextUpdate(checkable); } } -void IcingaDB::NextCheckUpdatedHandler(const Checkable::Ptr& checkable) +void IcingaDB::NextCheckChangedHandler(const Checkable::Ptr& checkable) { for (auto& rw : ConfigType::GetObjectsByType()) { rw->UpdateState(checkable, StateUpdate::Volatile); diff --git a/lib/icingadb/icingadb.hpp b/lib/icingadb/icingadb.hpp index 33b6414c89..778a13b23c 100644 --- a/lib/icingadb/icingadb.hpp +++ b/lib/icingadb/icingadb.hpp @@ -202,7 +202,7 @@ class IcingaDB : public ObjectImpl static void CommentRemovedHandler(const Comment::Ptr& comment); static void FlappingChangeHandler(const Checkable::Ptr& checkable, double changeTime); static void NewCheckResultHandler(const Checkable::Ptr& checkable); - static void NextCheckUpdatedHandler(const Checkable::Ptr& checkable); + static void NextCheckChangedHandler(const Checkable::Ptr& checkable); static void DependencyGroupChildRegisteredHandler(const Checkable::Ptr& child, const DependencyGroup::Ptr& dependencyGroup); static void DependencyGroupChildRemovedHandler(const DependencyGroup::Ptr& dependencyGroup, const std::vector& dependencies, bool removeGroup); static void HostProblemChangedHandler(const Service::Ptr& service); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cb815da737..2d93afc565 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -129,6 +129,15 @@ set(base_test_SOURCES $ ) +if (ICINGA2_WITH_CHECKER) + list(APPEND base_test_SOURCES + checker.cpp + checker-fixture.cpp + checker-fixture.hpp + $ + ) +endif() + if(ICINGA2_UNITY_BUILD) mkunity_target(base test base_test_SOURCES) endif() diff --git a/test/base-testloggerfixture.hpp b/test/base-testloggerfixture.hpp index 69c073b02a..a3571f8f9c 100644 --- a/test/base-testloggerfixture.hpp +++ b/test/base-testloggerfixture.hpp @@ -52,6 +52,27 @@ class TestLogger : public Logger return ret; } + /** + * Counts the number of log entries that match the given regex pattern. + * + * This only counts existing log entries, it does not wait for new ones to arrive. + * + * @param pattern The regex pattern the log message needs to match + * + * @return The number of log entries that match the given pattern + */ + auto CountExpectedLogPattern(const std::string& pattern) + { + std::lock_guard lock(m_Mutex); + int count = 0; + for (const auto& logEntry : m_LogEntries) { + if (boost::regex_match(logEntry.Message.GetData(), boost::regex(pattern))) { + ++count; + } + } + return count; + } + private: void ProcessLogEntry(const LogEntry& entry) override { @@ -88,11 +109,16 @@ struct TestLoggerFixture TestLoggerFixture() { testLogger->SetSeverity(testLogger->SeverityToString(LogDebug)); - testLogger->Activate(true); testLogger->SetActive(true); + testLogger->Activate(true); } ~TestLoggerFixture() + { + DeactivateLogger(); + } + + void DeactivateLogger() const { testLogger->SetActive(false); testLogger->Deactivate(true); diff --git a/test/checker-fixture.cpp b/test/checker-fixture.cpp new file mode 100644 index 0000000000..7e5de60f57 --- /dev/null +++ b/test/checker-fixture.cpp @@ -0,0 +1,201 @@ +/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */ + +#include "base/utility.hpp" +#include "config/configcompiler.hpp" +#include "remote/apilistener.hpp" +#include "test/checker-fixture.hpp" +#include "test/icingaapplication-fixture.hpp" + +using namespace icinga; + +CheckerFixture::CheckerFixture() +{ + checker = new CheckerComponent; + checker->SetResultTimerInterval(.4); // Speed up result timer for tests + checker->SetName("randomizer", true); + checker->Register(); + checker->OnConfigLoaded(); + checker->PreActivate(); + checker->Activate(); + + // Manually building and registering the command won't work here as we need a real callable + // function that produces cr and calls ProcessCheckResult on the checkable. So we use a small + // config snippet that imports the random and sleep check commands registered by the "methods-itl.cpp" + // file in the methods lib. + ConfigItem::RunWithActivationContext( + new Function{ + "checker-fixture", + [] { + std::unique_ptr expression = ConfigCompiler::CompileText( + "", + R"CONFIG( +object CheckCommand "random" { + import "random-check-command" +} + +object CheckCommand "sleep" { + import "sleep-check-command" +} +)CONFIG" + ); + BOOST_REQUIRE(expression); + ScriptFrame frame(true); + BOOST_CHECK_NO_THROW(expression->Evaluate(frame)); + } + } + ); + + Checkable::OnNewCheckResult.connect( + [this](const Checkable::Ptr&, const CheckResult::Ptr& cr, const MessageOrigin::Ptr&) { + ++resultCount; + lastResult.store(cr); + } + ); + // Track the next check times of the checkables, so we can verify that they are set to the expected + // value. The map is populated with the expected checkable names by the RegisterChecable() function. + // So, we can safely modify the map from within this signal handler without further locking, since there + // won't be any concurrent access to the same keys. + Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { + assert(nextCheckTimes.find(checkable->GetName()) != nextCheckTimes.end()); + nextCheckTimes[checkable->GetName()] = checkable->GetNextCheck(); + }); +} + +CheckerFixture::~CheckerFixture() +{ + checker->Deactivate(); +} + +void CheckerFixture::RegisterCheckablesRandom(int count, bool disableChecks, bool unreachable) +{ + for (int i = 1; i <= count; ++i) { + RegisterCheckable("host-" + std::to_string(i), "random", "", "", disableChecks, unreachable); + } +} + +void CheckerFixture::RegisterCheckablesSleep(int count, double sleepTime, bool disableChecks, bool unreachable) +{ + for (int i = 1; i <= count; ++i) { + auto h = RegisterCheckable("host-" + std::to_string(i), "sleep", "", "", disableChecks, unreachable); + h->SetVars(new Dictionary{{"sleep_time", sleepTime}}); + } +} + +void CheckerFixture::RegisterRemoteChecks(int count, bool isConnected, bool isSyncingReplayLogs) +{ + RegisterEndpoint("remote-checker", isConnected, isSyncingReplayLogs); + for (int i = 1; i <= count; ++i) { + auto h = RegisterCheckable("host-"+std::to_string(i), "random", "", "remote-checker"); + Checkable::OnRescheduleCheck(h, Utility::GetTime()); // Force initial scheduling + } +} + +Host::Ptr CheckerFixture::RegisterCheckable( + std::string name, + std::string cmd, + std::string period, + std::string endpoint, + bool disableChecks, + bool unreachable +) +{ + Host::Ptr host = new Host; + host->SetName(std::move(name), true); + host->SetCheckCommandRaw(std::move(cmd), true); + host->SetCheckInterval(checkInterval, true); + host->SetRetryInterval(retryInterval, true); + host->SetHAMode(HARunEverywhere, true); // Disable HA for tests + host->SetEnableActiveChecks(!disableChecks, true); + host->SetCheckPeriodRaw(std::move(period), true); + host->SetZoneName(endpoint, true); + host->SetCommandEndpointRaw(std::move(endpoint), true); + host->SetCheckTimeout(0, true); + host->Register(); + host->OnAllConfigLoaded(); + + nextCheckTimes[host->GetName()] = 0.0; // Initialize next check time tracking + + if (unreachable) { + Host::Ptr parent = new Host; + parent->SetName(Utility::NewUniqueID(), true); + parent->SetStateRaw(ServiceCritical, true); + parent->SetStateType(StateTypeHard, true); + parent->SetLastCheckResult(new CheckResult, true); + parent->Register(); + + Dependency::Ptr dep = new Dependency; + dep->SetName(Utility::NewUniqueID(), true); + dep->SetStateFilter(StateFilterUp, true); + dep->SetDisableChecks(true, true); + dep->SetParent(parent); + dep->SetChild(host); + dep->Register(); + + host->AddDependency(dep); + } + + host->PreActivate(); + host->Activate(); + return host; +} + +void CheckerFixture::SleepFor(double seconds, bool deactivateLogger) const +{ + Utility::Sleep(seconds); + Checkable::OnNextCheckChanged.disconnect_all_slots(); + Checkable::OnNewCheckResult.disconnect_all_slots(); + if (deactivateLogger) { + DeactivateLogger(); + } +} + +Endpoint::Ptr CheckerFixture::RegisterEndpoint(std::string name, bool isConnected, bool isSyncingReplayLogs) +{ + auto makeEndpoint = [](const std::string& name, bool syncing) { + Endpoint::Ptr remote = new Endpoint; + remote->SetName(name, true); + if (syncing) { + remote->SetSyncing(true); + } + remote->Register(); + remote->PreActivate(); + return remote; + }; + + Endpoint::Ptr remote = makeEndpoint(name, isSyncingReplayLogs); + Endpoint::Ptr local = makeEndpoint("local-tester", false); + + Zone::Ptr zone = new Zone; + zone->SetName(remote->GetName(), true); + zone->SetEndpointsRaw(new Array{{remote->GetName(), local->GetName()}}, true); + zone->Register(); + zone->OnAllConfigLoaded(); + zone->PreActivate(); + zone->Activate(); + + ApiListener::Ptr listener = new ApiListener; + listener->SetIdentity(local->GetName(), true); + listener->SetName(local->GetName(), true); + listener->Register(); + try { + listener->OnAllConfigLoaded(); // Initialize the m_LocalEndpoint of the listener! + + // May throw due to various reasons, but we only care that the m_Instance singleton + // is set which it will be if no other ApiListener is registered yet. + listener->OnConfigLoaded(); + } catch (const std::exception& ex) { + BOOST_TEST_MESSAGE("Exception during ApiListener::OnConfigLoaded: " << DiagnosticInformation(ex)); + } + + if (isConnected) { + JsonRpcConnection::Ptr client = new JsonRpcConnection( + new StoppableWaitGroup, + "anonymous", + false, + nullptr, + RoleClient + ); + remote->AddClient(client); + } + return remote; +} diff --git a/test/checker-fixture.hpp b/test/checker-fixture.hpp new file mode 100644 index 0000000000..963bcde39d --- /dev/null +++ b/test/checker-fixture.hpp @@ -0,0 +1,127 @@ +/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */ + +#pragma once + +#include "base/atomic.hpp" +#include "checker/checkercomponent.hpp" +#include "icinga/checkcommand.hpp" +#include "remote/endpoint.hpp" +#include "test/base-testloggerfixture.hpp" + +namespace icinga { + +/** + * Test fixture for tests involving the @c CheckerComponent. + * + * This fixture sets up a CheckerComponent instance and provides utility functions to register + * checkable objects. It is derived from @c TestLoggerFixture to capture log output during tests, + * so tests can verify that expected log messages are produced. The fixture also connects to the + * @c Checkable::OnNewCheckResult signal to count the number of check results produced during tests. + */ +struct CheckerFixture : TestLoggerFixture +{ + CheckerFixture(); + + ~CheckerFixture(); + + /** + * Registers a fully configured set of checkable hosts that execute the "random" check command. + * + * Each host is configured with a random check command, check interval, and retry interval. + * If @c unreachable is true, each host is made unreachable by adding a dependency on a parent + * host that is in a critical state. This prevents the checker from executing checks for the + * child hosts. The check and retry intervals are kept low to allow for quick test execution, + * but they can be adjusted via the @c interval and @c retry parameters. + * + * @param count Number of checkable hosts to register. + * @param disableChecks If true, disables active checks for each host. + * @param unreachable If true, makes each host unreachable via a dependency. + */ + void RegisterCheckablesRandom(int count, bool disableChecks = false, bool unreachable = false); + + /** + * Registers a fully configured set of checkable hosts that execute the "sleep" command. + * + * Each host is configured with a sleep check command that sleeps for the specified duration. + * The check and retry intervals can be adjusted via the @c checkInterval and @c retryInterval + * member variables of the fixture. If @c unreachable is true, each host is made unreachable by + * adding a dependency on a parent host that is in a critical state. This prevents the checker + * from executing checks for the child hosts. + * + * @param count Number of checkable hosts to register. + * @param sleepTime Duration (in seconds) that the sleep command should sleep. Defaults to 1.0 second. + * @param disableChecks If true, disables active checks for each host. + * @param unreachable If true, makes each host unreachable via a dependency. + */ + void RegisterCheckablesSleep( + int count, + double sleepTime = 1.0, + bool disableChecks = false, + bool unreachable = false + ); + + /** + * Registers a remote endpoint and a set of checkable hosts assigned to that endpoint. + * + * The remote endpoint can be configured to appear connected or disconnected, and can also + * be set to be syncing replay as needed for tests involving remote checks. + * + * @param count Number of checkable hosts to register. + * @param isConnected If true, the remote endpoint is marked as connected. + * @param isSyncingReplayLogs If true, the remote endpoint is marked as syncing replay logs. + */ + void RegisterRemoteChecks(int count, bool isConnected = false, bool isSyncingReplayLogs = false); + + Host::Ptr RegisterCheckable( + std::string name, + std::string cmd, + std::string period = "", + std::string endpoint = "", + bool disableChecks = false, + bool unreachable = false + ); + + /** + * Sleeps for the specified number of seconds, then immediately disconnects all signal handlers + * connected to @c Checkable::OnNextCheckChanged and @c Checkable::OnNewCheckResult. + * + * This is useful in tests to allow some time for checks to be executed and results to be processed, + * while ensuring that no further signal handlers are called after the sleep period. This helps to avoid + * unexpected side effects in tests, since the checker continues to run till the fixture is destroyed. + * + * @param seconds Number of seconds to sleep. + * @param deactivateLogger If true, deactivates the test logger after sleeping to prevent further log capture. + */ + void SleepFor(double seconds, bool deactivateLogger = false) const; + + /** + * Registers a remote endpoint with the specified name and connection/syncing state and returns it. + * + * @param name Name of the endpoint to register. + * @param isConnected If true, the endpoint is marked as connected. + * @param isSyncingReplayLogs If true, the endpoint is marked as syncing replay logs. + * + * @return The registered endpoint instance. + */ + static Endpoint::Ptr RegisterEndpoint(std::string name, bool isConnected = false, bool isSyncingReplayLogs = false); + + /** + * resultCount tracks the number of check results produced by the checker. + * + * This is used in tests to verify that checks are actually being executed and results processed. + * It is incremented from within the OnNewCheckResult signal handler, thus must be atomic. + */ + Atomic resultCount{0}; + AtomicOrLocked lastResult; // Stores the last check result received, for inspection in tests. + /** + * nextCheckTimes tracks the next scheduled check time for each registered checkable host. + * This might not be used in all tests, but is available for tests that need to verify the exact + * next check timestamp set by the checker. + */ + std::map nextCheckTimes; + double checkInterval{.1}; // Interval in seconds between regular checks for each checkable. + double retryInterval{.1}; // Interval in seconds between retry checks for each checkable. + CheckerComponent::Ptr checker; +}; + +} // namespace icinga diff --git a/test/checker.cpp b/test/checker.cpp new file mode 100644 index 0000000000..e0e0a0585c --- /dev/null +++ b/test/checker.cpp @@ -0,0 +1,311 @@ +/* Icinga 2 | (c) 2025 Icinga GmbH | GPLv2+ */ + +#include "test/checker-fixture.hpp" +#include "base/scriptglobal.hpp" +#include "icinga/host.hpp" +#include "icinga/dependency.hpp" +#include "icinga/legacytimeperiod.hpp" +#include + +using namespace icinga; + +BOOST_FIXTURE_TEST_SUITE(checker, CheckerFixture, *boost::unit_test::label("checker")) + +BOOST_AUTO_TEST_CASE(single_check) +{ + // For a single checkable, there shouldn't be any concurrent checks that trigger this event, + // so we can safely use boost assertion macros within the event handler. + Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { + BOOST_CHECK_EQUAL(checkable->GetName(), "host-1"); + BOOST_CHECK_LE(checkable->GetNextCheck(), checkable->GetLastCheck() + checkInterval + .5); + }); + + RegisterCheckablesRandom(1); + SleepFor(.4, true); + + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Executing check for 'host-1'")); + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Check finished for object 'host-1'")); + BOOST_CHECK_EQUAL(4, resultCount); +} + +BOOST_AUTO_TEST_CASE(multiple_checks) +{ + Checkable::OnNextCheckChanged.connect([this](const Checkable::Ptr& checkable, const Value&) { + BOOST_CHECK_LE(checkable->GetNextCheck(), checkable->GetLastCheck() + checkInterval + .5); + }); + + RegisterCheckablesRandom(8); + SleepFor(.3, true); + + BOOST_CHECK(ExpectLogPattern("Executing check for .*")); + BOOST_CHECK(ExpectLogPattern("Check finished for object .*")); + auto executedC = testLogger->CountExpectedLogPattern("Executing check for .*"); + auto finishedC = testLogger->CountExpectedLogPattern("Check finished for object .*"); + BOOST_CHECK_EQUAL(executedC, finishedC); + // With 8 checkables and a check interval of 0.1s, we expect that each checkable is checked at least + // twice, but some of them possibly up to 3 times, depending on the timing and OS scheduling behavior. + BOOST_CHECK_MESSAGE(22 <= resultCount && resultCount <= 25, "got=" << resultCount); +} + +BOOST_AUTO_TEST_CASE(disabled_checks) +{ + RegisterCheckablesRandom(4, true); + SleepFor(.3, true); + + auto failedC = testLogger->CountExpectedLogPattern("Skipping check for host .*: active host checks are disabled"); + BOOST_CHECK_MESSAGE(10 <= failedC && failedC <= 13, "got=" << failedC); + + auto rescheduleC = testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check."); + BOOST_CHECK_MESSAGE(10 <= rescheduleC && rescheduleC <= 13, "got=" << rescheduleC); + BOOST_CHECK_EQUAL(failedC, rescheduleC); + + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(globally_disabled_checks) +{ + IcingaApplication::GetInstance()->SetEnableHostChecks(false); // Disable active host checks globally + + RegisterCheckablesRandom(4); + SleepFor(.3, true); + + auto failedC = testLogger->CountExpectedLogPattern("Skipping check for host .*: active host checks are disabled"); + BOOST_CHECK_MESSAGE(10 <= failedC && failedC <= 13, "got=" << failedC); + + auto rescheduleC = testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check."); + BOOST_CHECK_MESSAGE(10 <= rescheduleC && rescheduleC <= 13, "got=" << rescheduleC); + BOOST_CHECK_EQUAL(failedC, rescheduleC); + + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(unreachable_checkable) +{ + // Create a dependency that makes the host unreachable (i.e. no checks should be executed). + // This must be done before activating the actual child checkable, otherwise the checker will + // immediately schedule a check before the dependency is in place. + RegisterCheckablesRandom(4, false, true); + SleepFor(.3, true); + + auto failedC = testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed."); + BOOST_CHECK_MESSAGE(10 <= failedC && failedC <= 13, "got=" << failedC); + + auto rescheduleC = testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check."); + BOOST_CHECK_MESSAGE(10 <= rescheduleC && rescheduleC <= 13, "got=" << rescheduleC); + BOOST_CHECK_EQUAL(failedC, rescheduleC); + + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(never_in_check_period) +{ + TimePeriod::Ptr period = new TimePeriod; + period->SetName("never"); + period->SetUpdate(new Function("LegacyTimePeriod", LegacyTimePeriod::ScriptFunc, {"tp", "begin", "end"}), true); + period->Register(); + period->PreActivate(); + period->Activate(); + + // Register some checkables that are only checked during the "never" time period, which is never. + (void)RegisterCheckable("host-1", "random", "never"); + (void)RegisterCheckable("host-2", "random", "never"); + (void)RegisterCheckable("host-3", "sleep", "never"); + (void)RegisterCheckable("host-4", "sleep", "never"); + + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + BOOST_TEST_MESSAGE("Host " << std::quoted(host) << " -> next_check: " << std::fixed << std::setprecision(0) + << nextCheck << " expected: " << Convert::ToDouble(period->GetValidEnd())); + // The checker should ignore the regular check interval and instead set the next check time based on the tp. + BOOST_CHECK_EQUAL(nextCheck, Convert::ToDouble(period->GetValidEnd())); + } + + // We expect that no checks are executed, and instead the checker reschedules the checks for the + // next valid end time of the "never" time period, which is always 24h from now. So, we should see + // 4 log messages about skipping the checks due to the time period, and nothing else. + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Skipping check for object .*, as not in check period 'never', until .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(in_check_period) +{ + TimePeriod::Ptr period = new TimePeriod; + period->SetName("24x7"); + period->SetRanges( + new Dictionary{ + {"monday", "00:00-24:00"}, + {"tuesday", "00:00-24:00"}, + {"wednesday", "00:00-24:00"}, + {"thursday", "00:00-24:00"}, + {"friday", "00:00-24:00"}, + {"saturday", "00:00-24:00"}, + {"sunday", "00:00-24:00"} + }, + true + ); + period->SetUpdate(new Function("LegacyTimePeriod", LegacyTimePeriod::ScriptFunc, {"tp", "begin", "end"}), true); + period->Register(); + period->PreActivate(); + period->Activate(); + + // Register some checkables that are only checked during the "always" time period, which is always. + (void)RegisterCheckable("host-1", "random", "always"); + (void)RegisterCheckable("host-2", "random", "always"); + (void)RegisterCheckable("host-3", "sleep", "always"); + (void)RegisterCheckable("host-4", "sleep", "always"); + + SleepFor(.3, true); + + // We expect that checks are executed normally, and the checker sets the next check time based + // on the regular check interval. So, we should see multiple checks executed for each checkable. + BOOST_CHECK(ExpectLogPattern("Executing check for .*")); + BOOST_CHECK(ExpectLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_MESSAGE(6 <= resultCount && resultCount <= 8, "got=" << resultCount); +} + +BOOST_AUTO_TEST_CASE(max_concurrent_checks) +{ + // Limit the number of concurrent checks to 4. + ScriptGlobal::Set("MaxConcurrentChecks", 4); + + // Register 16 checkables that each sleep for 10 seconds when executing their check. + // With a max concurrent check limit of 4, we should see that only 4 checks are executed + // at the same time, and the remaining 12 checks are queued until one of the running checks + // finishes (which will not happen within the short sleep time of this test). + RegisterCheckablesSleep(16, 10); + Utility::Sleep(.5); + + auto objects(ConfigType::GetObjectsByType()); + BOOST_CHECK_EQUAL(16, objects.size()); + + for (auto& h : objects) { + // Force a reschedule of the checks to see whether the checker does absolutely nothing + // when the max concurrent check limit is reached. Normally, this would force the checker + // to immediately pick up the checkable and execute its check, but since all 4 slots are + // already taken, the checker should just update its queue idx and do nothing else. + Checkable::OnRescheduleCheck(h, Utility::GetTime()); + } + Utility::Sleep(.5); + + // We expect that only 4 checks are started initially, and the other 12 checks should have + // never been run, since the sleep time for each check (10 seconds) is much longer than the + // total sleep time of this test (1 second). + BOOST_CHECK(ExpectLogPattern("Pending checkables: 4; Idle checkables: 12; Checks/s: .*")); + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Scheduling info for checkable .*: Object .*")); + BOOST_CHECK_EQUAL(4, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(4, Checkable::GetPendingChecks()); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Check finished for object .*")); // none finished yet + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(skipped_remote_checks) +{ + // The check execution for remote checks is skipped if the remote endpoint is not connected, + // not syncing, and we are within the cold startup window (5min after application start). + Application::GetInstance()->SetStartTime(Utility::GetTime()); + + // Set the check and retry intervals to 60 seconds, since it's sufficient to + // just verify that the checks are skipped only once, and not repeatedly. + checkInterval = 60; + retryInterval = 60; + + RegisterRemoteChecks(8); + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + BOOST_TEST_MESSAGE("Host " << std::quoted(host) << " -> next_check: " << std::fixed << std::setprecision(0) + << nextCheck << " roughly expected: " << Utility::GetTime() + checkInterval); + // Our algorithm for computing the next check time is not too precise, but it should roughly be within 5s of + // the expected next check time based on the check interval. See Checkable::UpdateNextCheck() for details. + BOOST_CHECK_GE(nextCheck, Utility::GetTime() + checkInterval-5); + BOOST_CHECK_LE(nextCheck, Utility::GetTime() + checkInterval); // but not more than the interval + } + + BOOST_CHECK_EQUAL(nullptr, lastResult.load()); // No check results should be received + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_EQUAL(0, resultCount); +} + +BOOST_AUTO_TEST_CASE(remote_checks_outside_cold_startup) +{ + Application::GetInstance()->SetStartTime(Utility::GetTime()-500); // Simulate being outside cold startup window + + checkInterval = 60; + retryInterval = 60; + + RegisterRemoteChecks(8); + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + BOOST_TEST_MESSAGE("Host " << std::quoted(host) << " -> next_check: " << std::fixed << std::setprecision(0) + << nextCheck << " roughly expected: " << Utility::GetTime() + checkInterval); + // Our algorithm for computing the next check time is not too precise, but it should roughly be within 5s of + // the expected next check time based on the check interval. See Checkable::UpdateNextCheck() for details. + BOOST_CHECK_GE(nextCheck, Utility::GetTime() + checkInterval-5); + BOOST_CHECK_LE(nextCheck, Utility::GetTime() + checkInterval); // but not more than the interval + } + + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(8, resultCount); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + + BOOST_REQUIRE(lastResult.load()); // We should now have a cr! + + auto cr = lastResult.load(); + BOOST_CHECK_EQUAL(ServiceUnknown, cr->GetState()); + String expectedOutput = "Remote Icinga instance 'remote-checker' is not connected to '" + Endpoint::GetLocalEndpoint()->GetName() + "'"; + BOOST_CHECK_EQUAL(expectedOutput, cr->GetOutput()); +} + +BOOST_AUTO_TEST_CASE(remote_checks_with_connected_endpoint) +{ + // Register a remote endpoint that is connected, so remote checks can be executed or actually simulate + // sending the check command to the remote endpoint. In this case, we shouldn't also receive any cr, + // but checker should reschedule the check at "now + check_timeout (which is 0) + 30s". + RegisterRemoteChecks(8, true); + SleepFor(.3, true); + + for (auto& [host, nextCheck] : nextCheckTimes) { + // In this case, there shouldn't be any OnNextCheckChanged events triggered, + // thus nextCheck should still be 0.0 as initialized by RegisterCheckable(). + BOOST_CHECK_EQUAL(0.0, nextCheck); + } + + auto hosts(ConfigType::GetObjectsByType()); + BOOST_CHECK_EQUAL(8, hosts.size()); + + for (auto& h : hosts) { + // Verify the next_check time is set to roughly now + 30s, since RegisterCheckable() + // initializes the check_timeout to 0. + BOOST_TEST_MESSAGE("Host " << std::quoted(h->GetName().GetData()) << " -> next_check: " << std::fixed + << std::setprecision(0) << h->GetNextCheck() << " roughly expected: " << Utility::GetTime() + 30); + BOOST_CHECK_GE(h->GetNextCheck(), Utility::GetTime() + 25); + BOOST_CHECK_LE(h->GetNextCheck(), Utility::GetTime() + 30); + } + + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Sending message 'event::ExecuteCommand' to 'remote-checker'")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Executing check for .*")); + BOOST_CHECK_EQUAL(8, testLogger->CountExpectedLogPattern("Check finished for object .*")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Skipping check for object .*: Dependency failed.")); + BOOST_CHECK_EQUAL(0, testLogger->CountExpectedLogPattern("Checks for checkable .* are disabled. Rescheduling check.")); + BOOST_CHECK_EQUAL(0, resultCount); // No check results should be received yet +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/icingaapplication-fixture.cpp b/test/icingaapplication-fixture.cpp index 80fa4bfd8e..32517ab128 100644 --- a/test/icingaapplication-fixture.cpp +++ b/test/icingaapplication-fixture.cpp @@ -8,6 +8,12 @@ static bool IcingaInitialized = false; IcingaApplicationFixture::IcingaApplicationFixture() { + // This limits the number of threads in the thread pool, but does not limit the number of concurrent checks. + // By default, we'll have only 2 threads in the pool, which is too low for some heavy checker tests. So, we + // set the concurrency to the number of hardware threads available on the system and the global thread pool + // size will be the double of that (see threadpool.cpp). + Configuration::Concurrency = std::thread::hardware_concurrency(); + if (!IcingaInitialized) InitIcingaApplication(); }