Skip to content

Commit 5adc31b

Browse files
Avoid repeating tasks in multiple files. Make sure cluster health is OK before play continues from last node
1 parent 9a4be0d commit 5adc31b

File tree

3 files changed

+73
-103
lines changed

3 files changed

+73
-103
lines changed

roles/elasticsearch/tasks/elasticsearch-rolling-start.yml

+3-27
Original file line number | Diff line number | Diff line change
@@ -52,30 +52,6 @@
5252
node_found: "{{ response.json | json_query(node_query) | length > 0 }}"
5353
node_query: "[?name=='{{ elasticsearch_nodename }}']"
5454

55-
- name: Enable shard allocation for the cluster
56-
ansible.builtin.uri:
57-
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
58-
method: PUT
59-
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
60-
body_format: json
61-
user: elastic
62-
password: "{{ elasticstack_password.stdout }}"
63-
validate_certs: no
64-
register: response
65-
# next line is boolean not string, so no quotes around true
66-
# use python truthiness
67-
until: "response.json.acknowledged == true"
68-
retries: 5
69-
delay: 30
70-
71-
- name: Wait for cluster health to return to yellow or green
72-
ansible.builtin.uri:
73-
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
74-
method: GET
75-
user: elastic
76-
password: "{{ elasticstack_password.stdout }}"
77-
validate_certs: no
78-
register: response
79-
until: "response.json.status == 'yellow' or response.json.status == 'green'"
80-
retries: 200
81-
delay: 30
55+
# Don't continue the play unless cluster health is OK
56+
- name: Cluster health check
57+
ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml

roles/elasticsearch/tasks/elasticsearch-rolling-stop.yml

+4-76
Original file line number | Diff line number | Diff line change
@@ -15,82 +15,10 @@
1515
ansible.builtin.set_fact:
1616
elasticsearch_http_protocol: "https"
1717

18-
# Usually we should not need this step. It's only there to recover from broken upgrade plays
19-
# Without this step the cluster would never recover and the play would always fail
20-
- name: Enable shard allocation for the cluster
21-
ansible.builtin.uri:
22-
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
23-
method: PUT
24-
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
25-
body_format: json
26-
user: elastic
27-
password: "{{ elasticstack_password.stdout }}"
28-
validate_certs: no
29-
register: response
30-
# next line is boolean not string, so no quotes around true
31-
# use python truthiness
32-
until: "response.json.acknowledged == true"
33-
retries: 5
34-
delay: 30
35-
36-
37-
#
38-
# Start cluster health check
39-
#
40-
41-
# this step is key!!! Don't restart more nodes until we can safely do so. This either requires a green cluster status, or a yellow status with 0 initializing or relocating shards
42-
#
43-
# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
44-
## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
45-
##
46-
## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
47-
##
48-
## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
49-
50-
- name: Check cluster health
51-
block:
52-
- name: Wait for cluster health to return to green
53-
ansible.builtin.uri:
54-
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
55-
method: GET
56-
user: elastic
57-
password: "{{ elasticstack_password.stdout }}"
58-
validate_certs: no
59-
register: response
60-
until: "response.json.status == 'green'"
61-
retries: 50
62-
delay: 30
63-
64-
# Timed out while waiting for green cluster
65-
# Check if we can continue with a yellow cluster
66-
rescue:
67-
- name: "Rescue: Check if cluster health is yellow"
68-
ansible.builtin.uri:
69-
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
70-
method: GET
71-
user: elastic
72-
password: "{{ elasticstack_password.stdout }}"
73-
validate_certs: no
74-
register: response
75-
failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
76-
77-
- name: "Rescue: Wait before verifying status"
78-
ansible.builtin.pause:
79-
seconds: 10
80-
81-
- name: "Rescue: Verify we can safely continue with yellow cluster"
82-
ansible.builtin.uri:
83-
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
84-
method: GET
85-
user: elastic
86-
password: "{{ elasticstack_password.stdout }}"
87-
validate_certs: no
88-
register: response
89-
failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
90-
91-
#
92-
# End cluster health check
93-
#
18+
# This step exists primarily to recover from a broken or restarted upgrade or rolling restart.
19+
# TODO: Only run this task for the first host.
20+
- name: Cluster health check
21+
ansible.builtin.include_tasks: elasticsearch-wait-for-cluster-health.yml
9422

9523

9624
# Disabling shard allocation right after enabling it seems redundant. Please see above for details.
roles/elasticsearch/tasks/elasticsearch-wait-for-cluster-health.yml (new file)

Original file line number | Diff line number | Diff line change
@@ -0,0 +1,66 @@
1+
---
2+
3+
# Make sure shard allocation is enabled
4+
- name: Enable shard allocation for the cluster
5+
ansible.builtin.uri:
6+
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/settings"
7+
method: PUT
8+
body: '{ "persistent": { "cluster.routing.allocation.enable": null }}'
9+
body_format: json
10+
user: elastic
11+
password: "{{ elasticstack_password.stdout }}"
12+
validate_certs: no
13+
register: response
14+
# The next line compares against a boolean, not a string, so true is not quoted
15+
# use python truthiness
16+
until: "response.json.acknowledged == true"
17+
retries: 5
18+
delay: 30
19+
20+
# From https://www.elastic.co/guide/en/elastic-stack/8.17/upgrading-elasticsearch.html
21+
## During a rolling upgrade, primary shards assigned to a node running the new version cannot have their replicas assigned to a node with the old version. The new version might have a different data format that is not understood by the old version.
22+
##
23+
## If it is not possible to assign the replica shards to another node (there is only one upgraded node in the cluster), the replica shards remain unassigned and status stays yellow.
24+
##
25+
## In this case, you can proceed once there are no initializing or relocating shards (check the init and relo columns).
26+
27+
- name: Check cluster health
28+
block:
29+
- name: Wait for cluster health to return to green
30+
ansible.builtin.uri:
31+
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
32+
method: GET
33+
user: elastic
34+
password: "{{ elasticstack_password.stdout }}"
35+
validate_certs: no
36+
register: response
37+
until: "response.json.status == 'green'"
38+
retries: 50
39+
delay: 30
40+
41+
# Timed out while waiting for green cluster
42+
# Check if we can continue with a yellow cluster
43+
rescue:
44+
- name: "Rescue: Check if cluster health is yellow"
45+
ansible.builtin.uri:
46+
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
47+
method: GET
48+
user: elastic
49+
password: "{{ elasticstack_password.stdout }}"
50+
validate_certs: no
51+
register: response
52+
failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"
53+
54+
- name: "Rescue: Wait before verifying status"
55+
ansible.builtin.pause:
56+
seconds: 10
57+
58+
- name: "Rescue: Verify we can safely continue with yellow cluster"
59+
ansible.builtin.uri:
60+
url: "{{ elasticsearch_http_protocol }}://{{ elasticsearch_api_host }}:{{ elasticstack_elasticsearch_http_port }}/_cluster/health"
61+
method: GET
62+
user: elastic
63+
password: "{{ elasticstack_password.stdout }}"
64+
validate_certs: no
65+
register: response
66+
failed_when: "response.json.status != 'yellow' or response.json.relocating_shards != 0 or response.json.initializing_shards != 0"

0 commit comments

Comments
 (0)