Skip to content

Commit fc73a5d

Browse files
authored
Merge pull request #153 from stackhpc/maintenance
Add enter_maintenance and exit_maintenance roles
2 parents 72c61b0 + a306c56 commit fc73a5d

File tree

13 files changed

+191
-2
lines changed

13 files changed

+191
-2
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Roles:
1515
* [commands](roles/commands/README.md) for running arbitrary commands
1616
* [crush_rules](roles/crush_rules/README.md) for defining CRUSH rules
1717
* [ec_profiles](roles/ec_profiles/README.md) for defining EC profiles
18+
* [enter_maintenance](roles/enter_maintenance/README.md) for placing hosts into maintenance
19+
* [exit_maintenance](roles/exit_maintenance/README.md) for removing hosts from maintenance
1820
* [keys](roles/keys/README.md) for defining auth keys
1921
* [pools](roles/pools/README.md) for defining pools
2022

galaxy.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
namespace: "stackhpc"
22
name: "cephadm"
3-
version: "1.17.0"
3+
version: "1.18.0"
44
readme: "README.md"
55
authors:
66
- "Michal Nasiadka"

roles/commands/README.md

+6
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,9 @@ with at least one host in it - see the `cephadm` role for more details.
3333
- "fs new cephfs cephfs_metadata cephfs_data"
3434
- "orch apply mds cephfs --placement 3"
3535
```
36+
37+
* `cephadm_commands_until`: An expression to evaluate to allow retrying commands. May reference the registered result variable, `cephadm_commands_result`. Default is `true` (do not use retries).
38+
39+
* `cephadm_commands_retries`: Number of retries to use with `cephadm_commands_until`. Default is 0.
40+
41+
* `cephadm_commands_delay`: Delay between retries with `cephadm_commands_until`. Default is 0.

roles/commands/defaults/main.yml

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
---
22
cephadm_command: ceph
33
cephadm_commands: []
4+
cephadm_commands_until: true
5+
cephadm_commands_retries: 0
6+
cephadm_commands_delay: 0

roles/commands/tasks/main.yml

+7-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
become: true
88
changed_when: true
99
when: cephadm_commands | length > 0
10-
1110
delegate_to: "{{ groups['mons'][0] }}"
1211
run_once: true
12+
until: cephadm_commands_until
13+
retries: "{{ cephadm_commands_retries }}"
14+
delay: "{{ cephadm_commands_delay }}"
15+
vars:
16+
# NOTE: Without this, the delegate host's ansible_host variable will not
17+
# be respected.
18+
ansible_host: "{{ hostvars[groups['mons'][0]].ansible_host | default(inventory_hostname) }}"

roles/enter_maintenance/README.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# enter_maintenance
2+
3+
This role places Ceph hosts into maintenance mode using `cephadm`.
4+
5+
## Prerequisites
6+
7+
This role should be executed on one host at a time. This can be achieved by
8+
adding `serial: 1` to a play.
9+
10+
### Host prerequisites
11+
12+
* The role assumes a connection to target hosts over SSH with a user that has passwordless sudo configured.
13+
* Either direct Internet access or a private registry with the desired Ceph image accessible to all hosts is required.
14+
15+
### Inventory
16+
17+
This role assumes the existence of the following groups:
18+
19+
* `mons`
20+
21+
with at least one host in it - see the `cephadm` role for more details.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
cephadm_hostname: "{{ ansible_facts.nodename }}"
+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
---
2+
- name: Check if host can enter maintenance mode
3+
ansible.builtin.include_role:
4+
name: stackhpc.cephadm.commands
5+
vars:
6+
cephadm_commands:
7+
- "orch host ok-to-stop {{ cephadm_hostname }}"
8+
9+
# Annoyingly, 'ceph orch host ok-to-stop' does not exit non-zero when
10+
# it is not OK to stop, so we need to check for specific messages.
11+
- name: Assert that it is safe to stop host
12+
ansible.builtin.assert:
13+
that:
14+
# This one is seen for monitors
15+
- "'It is NOT safe' not in cephadm_commands_result.results[0].stderr"
16+
# This one is seen for OSDs
17+
- "'unsafe to stop' not in cephadm_commands_result.results[0].stderr"
18+
fail_msg: "{{ cephadm_commands_result.results[0].stderr }}"
19+
20+
- name: Fail over Ceph manager
21+
ansible.builtin.include_role:
22+
name: stackhpc.cephadm.commands
23+
vars:
24+
cephadm_commands:
25+
- "mgr fail"
26+
when: '"Cannot stop active Mgr daemon" in cephadm_commands_result.results[0].stderr'
27+
28+
# RADOS Gateway services prevent a host from entering maintenance.
29+
# Remove the rgw label from the host and wait for Ceph orchestrator to remove
30+
# the service from the host.
31+
- name: Stop RADOS Gateway service
32+
when: "'rgws' in group_names"
33+
block:
34+
- name: Ensure rgw label has been removed from node
35+
ansible.builtin.include_role:
36+
name: stackhpc.cephadm.commands
37+
vars:
38+
cephadm_commands:
39+
- "orch host label rm {{ cephadm_hostname }} rgw"
40+
41+
- name: Wait for RADOS Gateway service to stop
42+
ansible.builtin.include_role:
43+
name: stackhpc.cephadm.commands
44+
vars:
45+
cephadm_commands:
46+
- "orch ls rgw --format json-pretty"
47+
cephadm_commands_until: >-
48+
{{ (cephadm_commands_result.stdout | from_json)[0].status.running ==
49+
(cephadm_commands_result.stdout | from_json)[0].status.size }}
50+
cephadm_commands_retries: 30
51+
cephadm_commands_delay: 10
52+
53+
- name: Ensure host is in maintenance mode
54+
block:
55+
- name: Ensure host is in maintenance mode
56+
ansible.builtin.include_role:
57+
name: stackhpc.cephadm.commands
58+
vars:
59+
cephadm_commands:
60+
- "orch host maintenance enter {{ cephadm_hostname }}"
61+
always:
62+
- name: Ensure rgw label has been added to node
63+
ansible.builtin.include_role:
64+
name: stackhpc.cephadm.commands
65+
vars:
66+
cephadm_commands:
67+
- "orch host label add {{ cephadm_hostname }} rgw"
68+
when: "'rgws' in group_names"
+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
---
2+
- name: Assert that execution is serialised
3+
ansible.builtin.assert:
4+
that:
5+
- ansible_play_batch | length == 1
6+
fail_msg: >-
7+
Hosts must be placed into maintenance one at a time in order to first check
8+
whether it is safe to stop them.
9+
10+
- name: List hosts in maintenance
11+
ansible.builtin.include_role:
12+
name: stackhpc.cephadm.commands
13+
vars:
14+
cephadm_commands:
15+
- "orch host ls --format json-pretty --host_status maintenance"
16+
17+
# Entering maintenance fails if the host is already in maintenance.
18+
- name: Enter maintenance
19+
ansible.builtin.include_tasks: enter.yml
20+
when: cephadm_hostname not in cephadm_hosts_in_maintenance
21+
vars:
22+
cephadm_hosts_in_maintenance: >-
23+
{{ cephadm_commands_result.results[0].stdout |
24+
from_json |
25+
map(attribute='hostname') |
26+
list }}

roles/exit_maintenance/README.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# exit_maintenance
2+
3+
This role removes Ceph hosts from maintenance mode using `cephadm`.
4+
5+
## Prerequisites
6+
7+
This role should be executed on one host at a time. This can be achieved by
8+
adding `serial: 1` to a play.
9+
10+
### Host prerequisites
11+
12+
* The role assumes a connection to target hosts over SSH with a user that has passwordless sudo configured.
13+
* Either direct Internet access or a private registry with the desired Ceph image accessible to all hosts is required.
14+
15+
### Inventory
16+
17+
This role assumes the existence of the following groups:
18+
19+
* `mons`
20+
21+
with at least one host in it - see the `cephadm` role for more details.
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
cephadm_hostname: "{{ ansible_facts.nodename }}"

roles/exit_maintenance/tasks/exit.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
- name: Ensure host has exited maintenance mode
3+
ansible.builtin.include_role:
4+
name: stackhpc.cephadm.commands
5+
vars:
6+
cephadm_commands:
7+
- "orch host maintenance exit {{ cephadm_hostname }}"

roles/exit_maintenance/tasks/main.yml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
---
2+
- name: Assert that execution is serialised
3+
ansible.builtin.assert:
4+
that:
5+
- ansible_play_batch | length == 1
6+
fail_msg: >-
7+
Hosts must be removed from maintenance one at a time.
8+
9+
- name: List hosts
10+
ansible.builtin.include_role:
11+
name: stackhpc.cephadm.commands
12+
vars:
13+
cephadm_commands:
14+
- "orch host ls --format json-pretty"
15+
16+
# Exiting maintenance fails if the host is not in maintenance or offline.
17+
- name: Exit maintenance
18+
ansible.builtin.include_tasks: exit.yml
19+
when: cephadm_host_status.status | lower in ["maintenance", "offline"]
20+
vars:
21+
cephadm_host_status: >-
22+
{{ cephadm_commands_result.results[0].stdout |
23+
from_json |
24+
selectattr('hostname', 'equalto', cephadm_hostname) |
25+
first }}

0 commit comments

Comments
 (0)