From 3518a90277d08f88e46f822b92b2c1fa3cae6992 Mon Sep 17 00:00:00 2001
From: Georg Pfuetzenreuter
Date: Mon, 22 Jul 2024 16:52:53 +0200
Subject: [PATCH] suse_ha: handle cluster restart

Instead of relying on the administrator to put a node into maintenance
mode before applying changes to the HA stack using Salt, have the state
logic take care of restarting services.

Signed-off-by: Georg Pfuetzenreuter
---
 suse_ha-formula/suse_ha/macros.jinja        | 32 +++++++++++++
 suse_ha-formula/suse_ha/map.jinja           | 20 +++++++++
 suse_ha-formula/suse_ha/pacemaker/init.sls  | 20 +++++----
 suse_ha-formula/suse_ha/quorum_wait_attempt |  9 ++++
 suse_ha-formula/suse_ha/restart.sls         | 50 +++++++++++++++++++++
 suse_ha-formula/suse_ha/sbd.sls             |  2 +-
 6 files changed, 124 insertions(+), 9 deletions(-)
 create mode 100644 suse_ha-formula/suse_ha/quorum_wait_attempt
 create mode 100644 suse_ha-formula/suse_ha/restart.sls

diff --git a/suse_ha-formula/suse_ha/macros.jinja b/suse_ha-formula/suse_ha/macros.jinja
index aed88466..402092fc 100644
--- a/suse_ha-formula/suse_ha/macros.jinja
+++ b/suse_ha-formula/suse_ha/macros.jinja
@@ -114,3 +114,35 @@ ha_fencing_ipmi_secret_{{ host }}:
       - ha_resource_update_{{ host }}
 {%- endif %}
 {%- endmacro -%}
+
+{%- macro restart() -%}
+{%- set file = '/var/adm/suse_ha_pending_restart' %}
+
+{%- if salt ['file.file_exists'](file) or is_standby %}
+  {%- if is_standby %}
+
+suse_ha_restart:
+  cmd.run:
+    - name: /usr/sbin/crm cluster restart
+    - shell: /bin/sh
+    - timeout: 600
+
+suse_ha_clear_restart:
+  file.absent:
+    - name: {{ file }}
+    - require:
+      - cmd: suse_ha_restart
+
+  {%- else %}
+    {%- do salt.log.warning('suse_ha: restart from previous execution is still pending, node is not in standby mode!') %}
+  {%- endif %} {#- close inner standby check #}
+
+{%- else %}
+
+suse_ha_note_restart_{{ service }}:
+  file.managed:
+    - name: {{ file }}
+    - replace: False
+
+{%- endif %} {#- close file or standby check #}
+{%- endmacro -%}
diff --git a/suse_ha-formula/suse_ha/map.jinja b/suse_ha-formula/suse_ha/map.jinja
index 5514f1b1..b5b6ca44 100644
--- a/suse_ha-formula/suse_ha/map.jinja
+++ b/suse_ha-formula/suse_ha/map.jinja
@@ -74,3 +74,23 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
 {%- set is_primary = False -%}
 {%- endif -%}
 {%- do salt.log.debug('suse_ha: is_primary: ' ~ is_primary) -%}
+
+{%- set cmd_kwargs = {
+    'clean_env': True,
+    'python_shell': False,
+    'shell': '/bin/sh',
+  }
+-%}
+{#- Jinja dict literals cannot unpack another mapping with "**", so derive the copy via dict(). #}
+{%- set cmd_kwargs_ir = dict(
+    cmd_kwargs,
+    ignore_retcode=True
+  ) -%}
+
+{%- if salt['cmd.has_exec']('/usr/sbin/crm_standby') and salt['cmd.retcode']('/usr/bin/systemctl is-active pacemaker', **cmd_kwargs_ir) == 0 %}
+  {%- set is_standby = salt['cmd.run_stdout']('/usr/sbin/crm_standby -Gq', **cmd_kwargs) == 'on' -%}
+  {%- do salt.log.debug('suse_ha: standby: ' ~ is_standby) -%}
+{%- else -%}
+  {%- set is_standby = True -%}
+  {%- do salt.log.debug('suse_ha: assuming standby') -%}
+{%- endif -%}
diff --git a/suse_ha-formula/suse_ha/pacemaker/init.sls b/suse_ha-formula/suse_ha/pacemaker/init.sls
index 0b450880..5f8f21bd 100644
--- a/suse_ha-formula/suse_ha/pacemaker/init.sls
+++ b/suse_ha-formula/suse_ha/pacemaker/init.sls
@@ -16,15 +16,15 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 -#}
 
-{%- from 'suse_ha/map.jinja' import cluster, fencing, sysconfig -%}
+{%- from 'suse_ha/map.jinja' import cluster, cmd_kwargs, fencing, is_standby, sysconfig -%}
 {%- from 'suse_ha/macros.jinja' import ha_resource, property, rsc_default, ipmi_secret -%}
 {%- set myfqdn = grains['fqdn'] -%}
 {%- set myhost = grains['host'] -%}
-{%- if salt['cmd.retcode']('test -x /usr/sbin/crmadmin') == 0 -%}
-{%- set clusterdc = salt['cmd.run']('/usr/sbin/crmadmin -q -D 1') -%}
+{%- if salt['cmd.has_exec']('/usr/sbin/crmadmin') -%}
+  {%- set clusterdc = salt['cmd.run']('/usr/sbin/crmadmin -q -D 1', **cmd_kwargs) -%}
 {%- else -%}
-{%- do salt.log.error('crmadmin is not available!') -%}
-{%- set clusterdc = None -%}
+  {%- do salt.log.error('crmadmin is not available!') -%}
+  {%- set clusterdc = None -%}
 {%- endif -%}
 
 {% if myfqdn == clusterdc or myhost == clusterdc %}
@@ -91,10 +91,14 @@ include:
 {%- endif %}
 {%- endif %}
   - suse_ha.resources
+  - suse_ha.restart
 
 {%- else %}
 {%- do salt.log.info('Not sending any Pacemaker configuration - ' ~ myfqdn ~ ' is not the designated controller.') -%}
 
+include:
+  - suse_ha.restart
+
 {%- if fencing.enable and 'ipmi' in fencing %}
 {%- for host, config in fencing.ipmi.hosts.items() %}
 {{ ipmi_secret(host, config['secret'], False) }}
@@ -107,7 +111,7 @@ include:
 pacemaker.service:
   service.running:
     - enable: True
-    - reload: True
+    - reload: False
     - retry:
         attempts: 3
         interval: 10
@@ -116,8 +120,6 @@ pacemaker.service:
       - suse_ha_packages
       - corosync.service
 {%- if sysconfig.pacemaker | length %}
-    - watch:
-      - suse_sysconfig: /etc/sysconfig/pacemaker
   suse_sysconfig.sysconfig:
     - name: /etc/sysconfig/pacemaker
     - header_pillar: managed_by_salt_formula_sysconfig
@@ -130,6 +132,8 @@ pacemaker.service:
 {%- endfor %}
     - require:
       - suse_ha_packages
+    - watch_in:
+      - cmd: suse_ha_restart
 {%- endif %}
 {%- else %}
 {%- do salt.log.error('suse_ha: cluster pillar not configured, not enabling Pacemaker!') %}
diff --git a/suse_ha-formula/suse_ha/quorum_wait_attempt b/suse_ha-formula/suse_ha/quorum_wait_attempt
new file mode 100644
index 00000000..1b9789a3
--- /dev/null
+++ b/suse_ha-formula/suse_ha/quorum_wait_attempt
@@ -0,0 +1,9 @@
+pacemaker_wait_for_quorum:
+  loop.until_no_eval:
+    - name: cmd.run
+    - args:
+      - crm_node -q
+    - expected: '1'
+    - period: 5
+    - timeout: 30
+
diff --git a/suse_ha-formula/suse_ha/restart.sls b/suse_ha-formula/suse_ha/restart.sls
new file mode 100644
index 00000000..25055981
--- /dev/null
+++ b/suse_ha-formula/suse_ha/restart.sls
@@ -0,0 +1,50 @@
+{#-
+Salt state file for managing SUSE HA cluster restarts
+Copyright (C) 2024 SUSE LLC
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+-#}
+
+{%- from 'suse_ha/map.jinja' import is_standby -%}
+{%- set file = '/var/adm/suse_ha_pending_restart' %}
+
+{%- if salt ['file.file_exists'](file) or is_standby %}
+  {%- if is_standby %}
+
+suse_ha_restart:
+  cmd.run:
+    - name: /usr/sbin/crm cluster restart
+    - shell: /bin/sh
+    - timeout: 600
+
+suse_ha_clear_pending_restart:
+  file.absent:
+    - name: {{ file }}
+    - require:
+      - cmd: suse_ha_restart
+
+  {%- else %}
+    {%- do salt.log.warning('suse_ha: restart from previous execution is still pending, node is not in standby mode!') %}
+  {%- endif %} {#- close inner standby check #}
+
+{%- else %}
+
+suse_ha_restart:
+  # file.managed would be more appropriate, but the file module does not offer a mod_watch function and we would need additional logic to differentiate between cmd and file in the calling watch directives
+  cmd.run:
+    - name: touch {{ file }}
+    - creates: {{ file }}
+    - shell: /bin/sh
+
+{%- endif %} {#- close file or standby check #}
diff --git a/suse_ha-formula/suse_ha/sbd.sls b/suse_ha-formula/suse_ha/sbd.sls
index 9e0a9ea9..252f429e 100644
--- a/suse_ha-formula/suse_ha/sbd.sls
+++ b/suse_ha-formula/suse_ha/sbd.sls
@@ -16,7 +16,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 -#}
 
-{%- from 'suse_ha/map.jinja' import sbd, sysconfig, is_primary -%}
+{%- from 'suse_ha/map.jinja' import sbd, sysconfig, is_primary, is_standby -%}
 {%- if 'devices' in sbd %}
 include:
   - .packages