From 3f9acedb2da6392a2f70b61c5e11d3bc6bb8aafc Mon Sep 17 00:00:00 2001 From: Mat Kowalski Date: Tue, 22 Oct 2024 12:13:19 +0200 Subject: [PATCH] OPNET-569: Do not run resolv-prepender from NM dispatcher This PR changes the logic of how NetworkManager communicates changes in the environment and how they are picked by on-prem-resolv-prepender. Previously the NM dispatcher script had a logic that would trigger a systemd service (either to start it or to restart). This proven to be problematic, prone to race conditions and in principle a complicated design. Now we are moving to a model where systemd on its own will decide what and when to restart, in our case by leveraging the systemd path units. NM dispatcher is now responsible only for writing a correct environment file (we need that in case DNS search domains change). Systemd path unit observes the file and if a change is detected, it will trigger whatever is necessary. There is other stuff in the NM dispatcher script that we are keeping as it is out of scope of this refactor. Closes: OPNET-569 --- .../NetworkManager-resolv-prepender.yaml | 20 +++++++++---------- .../units/on-prem-resolv-prepender.path.yaml | 1 + 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/templates/common/on-prem/files/NetworkManager-resolv-prepender.yaml b/templates/common/on-prem/files/NetworkManager-resolv-prepender.yaml index 4329c51ab7..8ebbdb5d3c 100644 --- a/templates/common/on-prem/files/NetworkManager-resolv-prepender.yaml +++ b/templates/common/on-prem/files/NetworkManager-resolv-prepender.yaml @@ -12,17 +12,11 @@ contents: mkdir -p /run/resolv-prepender echo "IP4_DOMAINS=$IP4_DOMAINS" > /run/resolv-prepender/env.new echo "IP6_DOMAINS=$IP6_DOMAINS" >> /run/resolv-prepender/env.new - # If we changed the environment, we should restart the service to pick up the - # new values. However, if the image hasn't been pulled successfully yet we can't - # restart the service or we may interrupt the pull and end up with a corrupt image. - # We're better off with incorrect search domains for a while than wedging the - # system with a bad image. - if ! diff -q /run/resolv-prepender/env /run/resolv-prepender/env.new && /usr/bin/podman image exists "{{ .Images.baremetalRuntimeCfgImage }}"; then - >&2 echo "NM resolv-prepender: Environment variable(s) changed. Restarting service." - systemctl is-active on-prem-resolv-prepender && systemctl kill on-prem-resolv-prepender + + if ! diff -q /run/resolv-prepender/env /run/resolv-prepender/env.new; then + >&2 echo "NM resolv-prepender: Environment variable(s) changed. Systemd path unit will pick the change." + mv -f /run/resolv-prepender/env.new /run/resolv-prepender/env fi - mv -f /run/resolv-prepender/env.new /run/resolv-prepender/env - systemctl start --no-block on-prem-resolv-prepender } export IP4_DOMAINS IP6_DOMAINS @@ -34,7 +28,8 @@ contents: # subsequent up&co. events as undesired. # # Given an overall Network Manager dispatcher timeout of 90 seconds, we are enforcing a slightly shorter - # timeout for the observed events. + # timeout for the observed events. It is not really required because all we do in this function is to write a file. + # We could get rid of this timeout completely, but it also does not cost much to keep it. case "$STATUS" in dns-change) >&2 echo "NM resolv-prepender triggered by ${IFACE} ${STATUS}." @@ -46,6 +41,7 @@ contents: exit 1 fi ;; + up|dhcp4-change|dhcp6-change|reapply) if [ ! -f "/run/networkmanager-dns-event-detected" ]; then >&2 echo "NM resolv-prepender triggered by ${IFACE} ${STATUS}." @@ -54,6 +50,7 @@ contents: exit 1 fi fi + # If $DHCP6_FQDN_FQDN is not empty and is not localhost.localdomain and static hostname was not already set if [[ -n "$DHCP6_FQDN_FQDN" && "$DHCP6_FQDN_FQDN" != "localhost.localdomain" && "$DHCP6_FQDN_FQDN" =~ "." ]] ; then STATIC_HOSTNAME="$(test ! -e /etc/hostname && echo -n || cat /etc/hostname | xargs)" @@ -64,6 +61,7 @@ contents: hostnamectl set-hostname --static --transient $DHCP6_FQDN_FQDN fi fi + if [[ "$STATUS" == "up" ]] && [[ $IFACE == "br-ex" ]]; then touch /run/nodeip-configuration/br-ex-up fi diff --git a/templates/common/on-prem/units/on-prem-resolv-prepender.path.yaml b/templates/common/on-prem/units/on-prem-resolv-prepender.path.yaml index fcfcadb357..4c5b20fa47 100644 --- a/templates/common/on-prem/units/on-prem-resolv-prepender.path.yaml +++ b/templates/common/on-prem/units/on-prem-resolv-prepender.path.yaml @@ -5,6 +5,7 @@ contents: | Description=Watches for changes in /var/run/NetworkManager/resolv.conf according to on-prem IPI needs [Path] PathModified=/var/run/NetworkManager/resolv.conf + PathModified=/run/resolv-prepender/env Unit=on-prem-resolv-prepender.service [Install] WantedBy=multi-user.target