diff --git a/docs/role-elasticsearch.md b/docs/role-elasticsearch.md
index 693dac09..d5f5fe89 100644
--- a/docs/role-elasticsearch.md
+++ b/docs/role-elasticsearch.md
@@ -9,6 +9,9 @@ If you use the role to set up security you, can use its CA to create certificate
 
 Please note that setting `elasticsearch_bootstrap_pw` as variable will only take effect when initialising Elasticsearch. Changes after starting elasticsearch for the first time will not change the bootstrap password for the instance and will lead to breaking tests.
 
+The role can perform a rolling upgrade of an Elasticsearch cluster. Nodes will be upgraded in inventory order. If you are using data tiers, your Ansible inventory should be sorted according to data tiers. Set `any_errors_fatal: true` in your playbook to abort if any errors occur during the upgrade.
+
+
 Role Variables
 --------------
 
@@ -54,6 +57,9 @@ This variable activates a workaround to start on systems that have certain harde
 * *elasticsearch_seed_hosts*: Set elasticsearch seed hosts
 * *elasticsearch_security_enrollment*: Controls enrollment (of nodes and Kibana) to a local node that’s been autoconfigured for security.
 
+* *elasticsearch_upgrade_routing_mode*: Set `cluster.routing.allocation.enable` during rolling upgrade. (default: `none`)
+* *elasticsearch_upgrade_stop_ml*: Stop the tasks associated with active machine learning jobs and datafeeds during rolling upgrade. (default: `false`)
+
 The following variable was only integrated to speed up upgrades of non-production clusters. Use with caution and at your own risk:
 
 * *elasticsearch_unsafe_upgrade_restart*: This will still perform rolling upgrades, but will first update the package and then restart the service. In contrast the default behaviour is to stop the service, do the upgrade and then start again. (default: `false`)
diff --git a/molecule/elasticsearch_upgrade/converge.yml b/molecule/elasticsearch_upgrade/converge.yml
new file mode 100644
index 00000000..68a1f8fa
--- /dev/null
+++ b/molecule/elasticsearch_upgrade/converge.yml
@@ -0,0 +1,35 @@
+---
+# The workaround for arbitrarily named role directory is important because the git repo has one name and the role within it another
+# Found at: https://github.com/ansible-community/molecule/issues/1567#issuecomment-436876722
+- name: Converge
+  collections:
+    - netways.elasticstack
+  hosts: all
+  vars:
+    # elasticsearch_security: true  # needed for tests of > 7 releases
+    elasticstack_full_stack: false
+    elasticsearch_jna_workaround: true
+    elasticsearch_disable_systemcallfilterchecks: true
+    elasticstack_release: "{{ lookup('env', 'ELASTIC_RELEASE') | int }}"
+    elasticsearch_heap: "1"
+    elasticstack_no_log: false
+  tasks:
+    - name: Include Elastics repos role
+      ansible.builtin.include_role:
+        name: repos
+    - name: Include Elasticsearch
+      ansible.builtin.include_role:
+        name: elasticsearch
+      vars:
+        elasticstack_version: "8.16.0"
+      tags:
+        - molecule-idempotence-notest
+
+    - name: Flush handlers before upgrade
+      ansible.builtin.meta: flush_handlers
+
+    - name: Include Elasticsearch
+      ansible.builtin.include_role:
+        name: elasticsearch
+      vars:
+        elasticstack_version: "8.17.0"
diff --git a/molecule/elasticsearch_upgrade/molecule.yml b/molecule/elasticsearch_upgrade/molecule.yml
new file mode 100644
index 00000000..7c2c71b7
--- /dev/null
+++ b/molecule/elasticsearch_upgrade/molecule.yml
@@ -0,0 +1,32 @@
+---
+dependency:
+  name: galaxy
+  options:
+    requirements-file: requirements.yml
+driver:
+  name: docker
+platforms:
+  - name: elasticsearch_default1
+    groups:
+      - elasticsearch
+    image: "geerlingguy/docker-${MOLECULE_DISTRO:-debian11}-ansible:latest"
+    command: ${MOLECULE_DOCKER_COMMAND:-""}
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:rw
+    cgroupns_mode: host
+    privileged: true
+    pre_build_image: true
+  - name: elasticsearch_default2
+    groups:
+      - elasticsearch
+    image: "geerlingguy/docker-${MOLECULE_DISTRO:-debian11}-ansible:latest"
+    command: ${MOLECULE_DOCKER_COMMAND:-""}
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:rw
+    cgroupns_mode: host
+    privileged: true
+    pre_build_image: true
+provisioner:
+  name: ansible
+verifier:
+  name: ansible
diff --git a/molecule/elasticsearch_upgrade/prepare.yml b/molecule/elasticsearch_upgrade/prepare.yml
new file mode 100644
index 00000000..cf162407
--- /dev/null
+++ b/molecule/elasticsearch_upgrade/prepare.yml
@@ -0,0 +1,15 @@
+---
+- name: Prepare
+  hosts: all
+  tasks:
+    - name: Install packages for Debian
+      ansible.builtin.apt:
+        name:
+          - gpg
+          - gpg-agent
+          - procps
+          - curl
+          - iproute2
+          - git
+          - openssl
+        update_cache: true
diff --git a/molecule/elasticsearch_upgrade/requirements.yml b/molecule/elasticsearch_upgrade/requirements.yml
new file mode 100644
index 00000000..8dd51618
--- /dev/null
+++ b/molecule/elasticsearch_upgrade/requirements.yml
@@ -0,0 +1,3 @@
+---
+collections:
+  - community.general
diff --git a/roles/elasticsearch/defaults/main.yml b/roles/elasticsearch/defaults/main.yml
index 29aaa0c6..2303c4aa 100644
--- a/roles/elasticsearch/defaults/main.yml
+++ b/roles/elasticsearch/defaults/main.yml
@@ -37,6 +37,10 @@ elasticsearch_cert_expiration_buffer: 30
 elasticsearch_cert_will_expire_soon: false
 elasticsearch_ssl_verification_mode: full
 
+# Upgrade options
+elasticsearch_upgrade_routing_mode: "none"
+elasticsearch_upgrade_stop_ml: false
+
 # use this only for non-prod environments and at your own risk!
 elasticsearch_unsafe_upgrade_restart: false
 
diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
index 8223cb74..283cc882 100644
--- a/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
+++ b/roles/elasticsearch/tasks/elasticsearch-rolling-start.yml
@@ -54,30 +54,17 @@
   delay: 3
   changed_when: false
 
-- name: Enable shard allocation for the cluster
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
-    method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
-    body_format: json
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  # next line is boolean not string, so no quotes around true
-  # use python truthiness
-  until: "response.json.acknowledged == true"
-  retries: 5
-  delay: 30
+# Don't continue the play unless cluster health is OK
+- name: Cluster health check
+  ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
 
-- name: Wait for cluster health to return to yellow or green
+- name: Restart ML jobs
   ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-    method: GET
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_ml/set_upgrade_mode?enabled=false"
+    method: POST
     user: elastic
     password: "{{ elasticstack_password.stdout }}"
     validate_certs: no
-  register: response
-  until: "response.json.status == 'yellow' or response.json.status == 'green'"
-  retries: 5
-  delay: 30
+  failed_when: false
+  when:
+    - elasticsearch_upgrade_stop_ml | bool
diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
index fe7a0948..3bf3719d 100644
--- a/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
+++ b/roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml
@@ -15,44 +15,29 @@
   ansible.builtin.set_fact:
     elasticsearch_http_protocol: "https"
 
-# Usually we should not need this step. It's only there to recover from broken upgrade plays
-# Without this step the cluster would never recover and the play would always fail
-- name: Enable shard allocation for the cluster
-  ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
-    method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
-    body_format: json
-    user: elastic
-    password: "{{ elasticstack_password.stdout }}"
-    validate_certs: no
-  register: response
-  # next line is boolean not string, so no quotes around true
-  # use python truthiness
-  until: "response.json.acknowledged == true"
-  retries: 5
-  delay: 30
+# This step is here primarily in order to recover from broken/restarted upgrade or rolling restart.
+# TODO: Only run this task for the first host.
+- name: Cluster health check
+  ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
+
 
-  # this step is key!!! Don't restart more nodes
-  # until all shards have completed recovery
-- name: Wait for cluster health to return to green
+- name: Stop ML jobs while upgrading
   ansible.builtin.uri:
-    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
-    method: GET
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_ml/set_upgrade_mode?enabled=true"
+    method: POST
     user: elastic
     password: "{{ elasticstack_password.stdout }}"
     validate_certs: no
-  register: response
-  until: "response.json.status == 'green'"
-  retries: 50
-  delay: 30
+  failed_when: false
+  run_once: true
+  when:
+    - elasticsearch_upgrade_stop_ml | bool
 
-# Disabling shard allocation right after enabling it seems redundant. Please see above for details.
 - name: Disable shard allocation for the cluster
   ansible.builtin.uri:
     url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
     method: PUT
-    body: '{ "persistent": { "cluster.routing.allocation.enable": "none" }}'
+    body: '{ "persistent": { "cluster.routing.allocation.enable": "{{ elasticsearch_upgrade_routing_mode }}" } }'
     body_format: json
     user: elastic
     password: "{{ elasticstack_password.stdout }}"
diff --git a/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml b/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml
index ab319c07..d8d90e9e 100644
--- a/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml
+++ b/roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml
@@ -64,7 +64,6 @@
   notify:
     - Restart Elasticsearch
 
-
 - name: Be careful about upgrade when Elasticsearch is running
   when:
     - elasticsearch_running.status.ActiveState == "active"
diff --git a/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml b/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml
new file mode 100644
index 00000000..ebb7cf6c
--- /dev/null
+++ b/roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml
@@ -0,0 +1,66 @@
+---
+
+# Make sure shard allocation is enabled
+- name: Enable shard allocation for the cluster
+  ansible.builtin.uri:
+    url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
+    method: PUT
+    body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
+    body_format: json
+    user: elastic
+    password: "{{ elasticstack_password.stdout }}"
+    validate_certs: false
+  register: response
+  # next line is boolean not string, so no quotes around true
+  # use python truthiness
+  until: "response.json.acknowledged == true"
+  retries: 5
+  delay: 30
+
+# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
+## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
+##
+## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
+##
+## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
+
+- name: Check cluster health
+  block:
+    - name: Wait for cluster health to return to green
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: false
+      register: response
+      until: "response.json.status == 'green'"
+      retries: 50
+      delay: 30
+
+  # Timed out while waiting for green cluster
+  # Check if we can continue with a yellow cluster
+  rescue:
+    - name: "Rescue: Check if cluster health is yellow"
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: false
+      register: response
+      failed_when: "response.json.status not in ['green', 'yellow'] or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
+
+    - name: "Rescue: Wait before verifying status"
+      ansible.builtin.pause:
+        seconds: 10
+
+    - name: "Rescue: Verify we can safely continue with yellow cluster"
+      ansible.builtin.uri:
+        url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
+        method: GET
+        user: elastic
+        password: "{{ elasticstack_password.stdout }}"
+        validate_certs: false
+      register: response
+      failed_when: "response.json.status not in ['green', 'yellow'] or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"