NETWAYS · ivareri · May 4, 2025 · May 4, 2025 · May 4, 2025 · May 4, 2025
diff --git a/docs/role-elasticsearch.md b/docs/role-elasticsearch.md
@@ -9,6 +9,9 @@ If you use the role to set up security you, can use its CA to create certificate
 
 Please note that setting `elasticsearch_bootstrap_pw` as variable will only take effect when initialising Elasticsearch. Changes after starting elasticsearch for the first time will not change the bootstrap password for the instance and will lead to breaking tests.
 
+The role can perform a rolling upgrade of an Elasticsearch cluster. Nodes will be upgraded in inventory order. If you are using data tiers, your Ansible inventory should be sorted according to data tiers. Set `any_errors_fatal: true` in your playbook to abort if any errors occur during the upgrade.
+
+
 Role Variables
 --------------
 
@@ -54,6 +57,9 @@ This variable activates a workaround to start on systems that have certain harde
 * *elasticsearch_seed_hosts*: Set elasticsearch seed hosts
 * *elasticsearch_security_enrollment*: Controls enrollment (of nodes and Kibana) to a local node that’s been autoconfigured for security.
 
+* *elasticsearch_upgrade_routing_mode*: Set `cluster.routing.allocation.enable` during rolling upgrade. (default: `none`)
+* *elasticsearch_upgrade_stop_ml*: Stop the tasks associated with active machine learning jobs and datafeeds during rolling upgrade. (default: `false`)
+
 The following variable was only integrated to speed up upgrades of non-production clusters. Use with caution and at your own risk:
 
 * *elasticsearch_unsafe_upgrade_restart*: This will still perform rolling upgrades, but will first update the package and then restart the service. In contrast the default behaviour is to stop the service, do the upgrade and then start again. (default: `false`)

diff --git a/molecule/elasticsearch_upgrade/converge.yml b/molecule/elasticsearch_upgrade/converge.yml
@@ -0,0 +1,35 @@
+---
+# The workaround for arbitrarily named role directory is important because the git repo has one name and the role within it another
+# Found at: https://github.com/ansible-community/molecule/issues/1567#issuecomment-436876722
+- name: Converge
+  collections:
+    - netways.elasticstack
+  hosts: all
+  vars:
+    # elasticsearch_security: true # needed for tests of > 7 releases
+    elasticstack_full_stack: false
+    elasticsearch_jna_workaround: true
+    elasticsearch_disable_systemcallfilterchecks: true
+    elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | int }}"
+    elasticsearch_heap: "1"
+    elasticstack_no_log: false
+  tasks:
+    - name: Include Elastics repos role
+      ansible.builtin.include_role:
+        name: repos
+    - name: Include Elasticsearch (baseline version)
+      ansible.builtin.include_role:
+        name: elasticsearch
+      vars:
+        elasticstack_version: "8.16.0"  # release installed by the first role run
+      tags:
+        - molecule-idempotence-notest
+
+    - name: Flush handlers before upgrade
+      ansible.builtin.meta: flush_handlers
+
+    - name: Include Elasticsearch (upgrade version)
+      ansible.builtin.include_role:
+        name: elasticsearch
+      vars:
+        elasticstack_version: "8.17.0"  # newer release requested by the second role run
diff --git a/molecule/elasticsearch_upgrade/molecule.yml b/molecule/elasticsearch_upgrade/molecule.yml
@@ -0,0 +1,32 @@
+---
+dependency:
+  name: galaxy
+  options:
+    requirements-file: requirements.yml  # galaxy requirements file for this scenario
+driver:
+  name: docker
+platforms:
+  - name: elasticsearch_default1  # first of two identically configured containers
+    groups:
+      - elasticsearch
+    image: "geerlingguy/docker-${MOLECULE_DISTRO:-debian11}-ansible:latest"  # distro taken from MOLECULE_DISTRO, default debian11
+    command: ${MOLECULE_DOCKER_COMMAND:-""}
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:rw
+    cgroupns_mode: host
+    privileged: true
+    pre_build_image: true
+  - name: elasticsearch_default2  # second container, same settings and same group as the first
+    groups:
+      - elasticsearch
+    image: "geerlingguy/docker-${MOLECULE_DISTRO:-debian11}-ansible:latest"
+    command: ${MOLECULE_DOCKER_COMMAND:-""}
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:rw
+    cgroupns_mode: host
+    privileged: true
+    pre_build_image: true
+provisioner:
+  name: ansible
+verifier:
+  name: ansible
diff --git a/molecule/elasticsearch_upgrade/prepare.yml b/molecule/elasticsearch_upgrade/prepare.yml
@@ -0,0 +1,17 @@
+---
+- name: Prepare
+  hosts: all
+  tasks:
+    # The scenario image distro is configurable, so only call apt on Debian-family hosts.
+    - name: Install packages for Debian
+      ansible.builtin.apt:
+        name:
+          - gpg
+          - gpg-agent
+          - procps
+          - curl
+          - iproute2
+          - git
+          - openssl
+        update_cache: true
+      when: ansible_facts['os_family'] == "Debian"
diff --git a/molecule/elasticsearch_upgrade/requirements.yml b/molecule/elasticsearch_upgrade/requirements.yml
@@ -0,0 +1,3 @@
+---
+collections:
+  - community.general
diff --git a/roles/elasticsearch/defaults/main.yml b/roles/elasticsearch/defaults/main.yml
@@ -37,6 +37,10 @@ elasticsearch_cert_expiration_buffer: 30
 elasticsearch_cert_will_expire_soon: false
 elasticsearch_ssl_verification_mode: full
 
+# Upgrade options
+elasticsearch_upgrade_routing_mode: "none"
+elasticsearch_upgrade_stop_ml: false
+
 # use this only for non-prod environments and at your own risk!
 elasticsearch_unsafe_upgrade_restart: false
 

diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
@@ -54,30 +54,17 @@
   delay: 3
   changed_when: false
 
-- name: Enable shard allocation for the cluster
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
-    method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
-    body_format: json
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  # next line is boolean not string, so no quotes around true
-  # use python truthiness
-  until: "response.json.acknowledged == true"
-  retries: 5
-  delay: 30
+# Don't continue the play unless cluster health is OK
+- name: Cluster health check
+  ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
 
-- name: Wait for cluster health to return to yellow or green
+- name: Restart ML jobs
   ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-    method: GET
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_ml/set_upgrade_mode?enabled=false"
+    method: POST
     user: elastic
     password: "{{ elasticstack_password.stdout }}"
     validate_certs: no
-  register: response
-  until: "response.json.status == 'yellow' or response.json.status == 'green'"
-  retries: 5
-  delay: 30
+  failed_when: false
+  when:
+    - elasticsearch_upgrade_stop_ml | bool
diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
@@ -15,44 +15,29 @@
   ansible.builtin.set_fact:
     elasticsearch_http_protocol: "https"
 
-# Usually we should not need this step. It's only there to recover from broken upgrade plays
-# Without this step the cluster would never recover and the play would always fail
-- name: Enable shard allocation for the cluster
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
-    method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
-    body_format: json
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  # next line is boolean not string, so no quotes around true
-  # use python truthiness
-  until: "response.json.acknowledged == true"
-  retries: 5
-  delay: 30
+# This step is here primarily in order to recover from broken/restarted upgrade or rolling restart.
+# TODO: Only run this task for the first host.
+- name: Cluster health check
+  ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
+
 
-  # this step is key!!!  Don't restart more nodes
-  # until all shards have completed recovery
-- name: Wait for cluster health to return to green
+- name: Stop ML jobs while upgrading
   ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-    method: GET
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_ml/set_upgrade_mode?enabled=true"
+    method: POST
     user: elastic
     password: "{{ elasticstack_password.stdout }}"
     validate_certs: no
-  register: response
-  until: "response.json.status == 'green'"
-  retries: 50
-  delay: 30
+  failed_when: false
+  run_once: true
+  when:
+    - elasticsearch_upgrade_stop_ml | bool
 
-# Disabling shard allocation right after enabling it seems redundant. Please see above for details.
 - name: Disable shard allocation for the cluster
   ansible.builtin.uri:
     url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
     method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}'
+    body: "{ \"persistent\": { \"cluster.routing.allocation.enable\": \"{{ elasticsearch_upgrade_routing_mode }}\" } }"
     body_format: json
     user: elastic
     password: "{{ elasticstack_password.stdout }}"

diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml
@@ -64,7 +64,6 @@
       notify:
         - Restart Elasticsearch
 
-
 - name: Be careful about upgrade when Elasticsearch is running
   when:
     - elasticsearch_running.status.ActiveState == "active"

diff --git a/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml b/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml
@@ -0,0 +1,66 @@
+---
+
+# Make sure shard allocation is enabled: sending null for the persistent setting clears any earlier override.
+- name: Enable shard allocation for the cluster
+  ansible.builtin.uri:
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
+    method: PUT
+    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
+    body_format: json
+    user: elastic
+    password: "{{ elasticstack_password.stdout }}"
+    validate_certs: no
+  register: response
+  # `acknowledged` is a JSON boolean, so it is compared with the bare word true, not the string 'true'.
+  # The request is retried up to 5 times, 30 seconds apart, until the cluster acknowledges the change.
+  until: "response.json.acknowledged == true"
+  retries: 5
+  delay: 30
+
+# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
+## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
+##
+## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
+##
+## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
+
+- name: Check cluster health
+  block:
+    - name: Wait for cluster health to return to green
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: no
+      register: response
+      until: "response.json.status == 'green'"
+      retries: 50
+      delay: 30
+
+  # Reached only when the wait above fails (retries exhausted without a green status).
+  # Accept a yellow cluster if no shards are relocating or initializing, checked twice 10 seconds apart.
+  rescue:
+    - name: "Rescue: Check if cluster health is yellow"
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: no
+      register: response
+      failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
+
+    - name: "Rescue: Wait before verifying status"
+      ansible.builtin.pause:
+        seconds: 10
+
+    - name: "Rescue: Verify we can safely continue with yellow cluster"
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: no
+      register: response
+      failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
-Original file line number
+Diff line change
@@ Expand Up / @@ -64,7 +64,6 @@ @@
           notify:
             - Restart Elasticsearch
     - name: Be careful about upgrade when Elasticsearch is running
       when:
         - elasticsearch_running.status.ActiveState == "active"
@@ Expand Down @@