Skip to content

Commit fc73a5d

Browse files
authored
Merge pull request #153 from stackhpc/maintenance
Add enter_maintenance and exit_maintenance roles
2 parents 72c61b0 + a306c56 commit fc73a5d

File tree

13 files changed

+191
-2
lines changed

13 files changed

+191
-2
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Roles:
1515
* [commands](roles/commands/README.md) for running arbitrary commands
1616
* [crush_rules](roles/crush_rules/README.md) for defining CRUSH rules
1717
* [ec_profiles](roles/ec_profiles/README.md) for defining EC profiles
18+
* [enter_maintenance](roles/enter_maintenance/README.md) for placing hosts into maintenance
19+
* [exit_maintenance](roles/exit_maintenance/README.md) for removing hosts from maintenance
1820
* [keys](roles/keys/README.md) for defining auth keys
1921
* [pools](roles/pools/README.md) for defining pools
2022

galaxy.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
namespace: "stackhpc"
22
name: "cephadm"
3-
version: "1.17.0"
3+
version: "1.18.0"
44
readme: "README.md"
55
authors:
66
- "Michal Nasiadka"

roles/commands/README.md

+6
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,9 @@ with at least one host in it - see the `cephadm` role for more details.
3333
- "fs new cephfs cephfs_metadata cephfs_data"
3434
- "orch apply mds cephfs --placement 3"
3535
```
36+
37+
* `cephadm_commands_until`: An expression to evaluate to allow retrying commands. May reference the registered result variable, `cephadm_commands_result`. Default is `true` (do not use retries).
38+
39+
* `cephadm_commands_retries`: Number of retries to use with `cephadm_commands_until`. Default is 0.
40+
41+
* `cephadm_commands_delay`: Delay between retries with `cephadm_commands_until`. Default is 0.

roles/commands/defaults/main.yml

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
---
22
cephadm_command: ceph
33
cephadm_commands: []
4+
cephadm_commands_until: true
5+
cephadm_commands_retries: 0
6+
cephadm_commands_delay: 0

roles/commands/tasks/main.yml

+7-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
become: true
88
changed_when: true
99
when: cephadm_commands | length > 0
10-
1110
delegate_to: "{{ groups['mons'][0] }}"
1211
run_once: true
12+
until: cephadm_commands_until
13+
retries: "{{ cephadm_commands_retries }}"
14+
delay: "{{ cephadm_commands_delay }}"
15+
vars:
16+
# NOTE: Without this, the delegate host's ansible_host variable will not
17+
# be respected.
18+
ansible_host: "{{ hostvars[groups['mons'][0]].ansible_host | default(inventory_hostname) }}"

roles/enter_maintenance/README.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# enter_maintenance
2+
3+
This role places Ceph hosts into maintenance mode using `cephadm`.
4+
5+
## Prerequisites
6+
7+
This role should be executed on one host at a time. This can be achieved by
8+
adding `serial: 1` to a play.
9+
10+
### Host prerequisites
11+
12+
* The role assumes a connection to target hosts over SSH with a user that has passwordless sudo configured.
13+
* Either direct Internet access or a private registry with the desired Ceph image accessible to all hosts is required.
14+
15+
### Inventory
16+
17+
This role assumes the existence of the following groups:
18+
19+
* `mons`
20+
21+
with at least one host in it - see the `cephadm` role for more details.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
cephadm_hostname: "{{ ansible_facts.nodename }}"
+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
---
2+
- name: Check if host can enter maintenance mode
3+
ansible.builtin.include_role:
4+
name: stackhpc.cephadm.commands
5+
vars:
6+
cephadm_commands:
7+
- "orch host ok-to-stop {{ cephadm_hostname }}"
8+
9+
# Annoyingly, 'ceph orch host ok-to-stop' does not exit non-zero when
10+
# it is not OK to stop, so we need to check for specific messages.
11+
- name: Assert that it is safe to stop host
12+
ansible.builtin.assert:
13+
that:
14+
# This one is seen for monitors
15+
- "'It is NOT safe' not in cephadm_commands_result.results[0].stderr"
16+
# This one is seen for OSDs
17+
- "'unsafe to stop' not in cephadm_commands_result.results[0].stderr"
18+
fail_msg: "{{ cephadm_commands_result.results[0].stderr }}"
19+
20+
- name: Fail over Ceph manager
21+
ansible.builtin.include_role:
22+
name: stackhpc.cephadm.commands
23+
vars:
24+
cephadm_commands:
25+
- "mgr fail"
26+
when: '"Cannot stop active Mgr daemon" in cephadm_commands_result.results[0].stderr'
27+
28+
# RADOS Gateway services prevent a host from entering maintenance.
29+
# Remove the rgw label from the host and wait for Ceph orchestrator to remove
30+
# the service from the host.
31+
- name: Stop RADOS Gateway service
32+
when: "'rgws' in group_names"
33+
block:
34+
- name: Ensure rgw label has been removed from node
35+
ansible.builtin.include_role:
36+
name: stackhpc.cephadm.commands
37+
vars:
38+
cephadm_commands:
39+
- "orch host label rm {{ cephadm_hostname }} rgw"
40+
41+
- name: Wait for RADOS Gateway service to stop
42+
ansible.builtin.include_role:
43+
name: stackhpc.cephadm.commands
44+
vars:
45+
cephadm_commands:
46+
- "orch ls rgw --format json-pretty"
47+
cephadm_commands_until: >-
48+
{{ (cephadm_commands_result.stdout | from_json)[0].status.running ==
49+
(cephadm_commands_result.stdout | from_json)[0].status.size }}
50+
cephadm_commands_retries: 30
51+
cephadm_commands_delay: 10
52+
53+
- name: Ensure host is in maintenance mode
54+
block:
55+
- name: Ensure host is in maintenance mode
56+
ansible.builtin.include_role:
57+
name: stackhpc.cephadm.commands
58+
vars:
59+
cephadm_commands:
60+
- "orch host maintenance enter {{ cephadm_hostname }}"
61+
always:
62+
- name: Ensure rgw label has been added to node
63+
ansible.builtin.include_role:
64+
name: stackhpc.cephadm.commands
65+
vars:
66+
cephadm_commands:
67+
- "orch host label add {{ cephadm_hostname }} rgw"
68+
when: "'rgws' in group_names"
+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
---
2+
- name: Assert that execution is serialised
3+
ansible.builtin.assert:
4+
that:
5+
- ansible_play_batch | length == 1
6+
fail_msg: >-
7+
Hosts must be placed into maintenance one at a time in order to first check
8+
whether it is safe to stop them.
9+
10+
- name: List hosts in maintenance
11+
ansible.builtin.include_role:
12+
name: stackhpc.cephadm.commands
13+
vars:
14+
cephadm_commands:
15+
- "orch host ls --format json-pretty --host_status maintenance"
16+
17+
# Entering maintenance fails if the host is already in maintenance.
18+
- name: Enter maintenance
19+
ansible.builtin.include_tasks: enter.yml
20+
when: cephadm_hostname not in cephadm_hosts_in_maintenance
21+
vars:
22+
cephadm_hosts_in_maintenance: >-
23+
{{ cephadm_commands_result.results[0].stdout |
24+
from_json |
25+
map(attribute='hostname') |
26+
list }}

roles/exit_maintenance/README.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# exit_maintenance
2+
3+
This role removes Ceph hosts from maintenance mode using `cephadm`.
4+
5+
## Prerequisites
6+
7+
This role should be executed on one host at a time. This can be achieved by
8+
adding `serial: 1` to a play.
9+
10+
### Host prerequisites
11+
12+
* The role assumes a connection to target hosts over SSH with a user that has passwordless sudo configured.
13+
* Either direct Internet access or a private registry with the desired Ceph image accessible to all hosts is required.
14+
15+
### Inventory
16+
17+
This role assumes the existence of the following groups:
18+
19+
* `mons`
20+
21+
with at least one host in it - see the `cephadm` role for more details.
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
cephadm_hostname: "{{ ansible_facts.nodename }}"

roles/exit_maintenance/tasks/exit.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
- name: Ensure host has exited maintenance mode
3+
ansible.builtin.include_role:
4+
name: stackhpc.cephadm.commands
5+
vars:
6+
cephadm_commands:
7+
- "orch host maintenance exit {{ cephadm_hostname }}"

roles/exit_maintenance/tasks/main.yml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
---
2+
- name: Assert that execution is serialised
3+
ansible.builtin.assert:
4+
that:
5+
- ansible_play_batch | length == 1
6+
fail_msg: >-
7+
Hosts must be removed from maintenance one at a time.
8+
9+
- name: List hosts
10+
ansible.builtin.include_role:
11+
name: stackhpc.cephadm.commands
12+
vars:
13+
cephadm_commands:
14+
- "orch host ls --format json-pretty"
15+
16+
# Exiting maintenance fails if the host is not in maintenance or offline.
17+
- name: Exit maintenance
18+
ansible.builtin.include_tasks: exit.yml
19+
when: cephadm_host_status.status | lower in ["maintenance", "offline"]
20+
vars:
21+
cephadm_host_status: >-
22+
{{ cephadm_commands_result.results[0].stdout |
23+
from_json |
24+
selectattr('hostname', 'equalto', cephadm_hostname) |
25+
first }}

0 commit comments

Comments
 (0)