From 9347b65d5b9a25207c80ed4558bfec0bfe1f9539 Mon Sep 17 00:00:00 2001 From: SK Ali Arman Date: Fri, 17 Apr 2026 19:22:43 +0600 Subject: [PATCH 1/6] InnoDB Cluster Support Signed-off-by: SK Ali Arman --- scripts/run_innodb.sh | 61 +++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/scripts/run_innodb.sh b/scripts/run_innodb.sh index f422731..33dd1be 100755 --- a/scripts/run_innodb.sh +++ b/scripts/run_innodb.sh @@ -13,7 +13,7 @@ function log() { #stores all the arguments that are passed from statefulSet args=$@ -report_host="$HOSTNAME.$GOV_SVC.$POD_NAMESPACE.svc" +report_host="$HOSTNAME.$GOV_SVC.$POD_NAMESPACE" log "INFO" "report_host = $report_host" # wait for the peer-list file created by coordinator while [ ! -f "/scripts/peer-list" ]; do @@ -102,19 +102,29 @@ function create_replication_user() { # if the user doesn't exist, crete new one. if [[ "$out" -eq "0" ]]; then log "INFO" "Replication user not found. Creating new replication user..." 
- retry 120 ${mysql} -N -e "SET SQL_LOG_BIN=0;" - retry 120 ${mysql} -N -e "CREATE USER 'repl'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' REQUIRE SSL;" - retry 120 ${mysql} -N -e "GRANT CREATE USER, FILE, PROCESS, RELOAD, REPLICATION CLIENT, REPLICATION SLAVE, SELECT, SHUTDOWN, SUPER ON *.* TO 'repl'@'%' WITH GRANT OPTION;" - retry 120 ${mysql} -N -e "GRANT DELETE, INSERT, UPDATE ON mysql.* TO 'repl'@'%' WITH GRANT OPTION;" - retry 120 ${mysql} -N -e "GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, EXECUTE, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata.* TO 'repl'@'%' WITH GRANT OPTION;" - retry 120 ${mysql} -N -e "GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, EXECUTE, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_bkp.* TO 'repl'@'%' WITH GRANT OPTION;" - retry 120 ${mysql} -N -e "GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, EXECUTE, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_previous.* TO 'repl'@'%' WITH GRANT OPTION;" - retry 120 ${mysql} -N -e "GRANT CLONE_ADMIN, BACKUP_ADMIN, CONNECTION_ADMIN, EXECUTE, GROUP_REPLICATION_ADMIN, PERSIST_RO_VARIABLES_ADMIN, REPLICATION_APPLIER, REPLICATION_SLAVE_ADMIN, ROLE_ADMIN, SYSTEM_VARIABLES_ADMIN ON *.* TO 'repl'@'%' WITH GRANT OPTION;" - #mysql-server docker image doesn't has the user root that can connect from any host - retry 10 ${mysql} -N -e "CREATE USER 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}';" - retry 120 ${mysql} -N -e "GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION;" - retry 120 ${mysql} -N -e "FLUSH PRIVILEGES;" - retry 120 ${mysql} -N -e "SET SQL_LOG_BIN=1;" + retry 120 ${mysql} -N -e " + SET SQL_LOG_BIN=0; + CREATE USER IF NOT EXISTS 'repl'@'%' 
IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' REQUIRE SSL; + GRANT CREATE USER, FILE, PROCESS, RELOAD, REPLICATION CLIENT, REPLICATION SLAVE, SELECT, SHUTDOWN, SUPER ON *.* TO 'repl'@'%' WITH GRANT OPTION; + GRANT DELETE, INSERT, UPDATE ON mysql.* TO 'repl'@'%' WITH GRANT OPTION; + GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, EXECUTE, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata.* TO 'repl'@'%' WITH GRANT OPTION; + GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, EXECUTE, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_bkp.* TO 'repl'@'%' WITH GRANT OPTION; + GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, EXECUTE, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_previous.* TO 'repl'@'%' WITH GRANT OPTION; + GRANT CLONE_ADMIN, BACKUP_ADMIN, CONNECTION_ADMIN, EXECUTE, GROUP_REPLICATION_ADMIN, PERSIST_RO_VARIABLES_ADMIN, REPLICATION_APPLIER, REPLICATION_SLAVE_ADMIN, ROLE_ADMIN, SYSTEM_VARIABLES_ADMIN ON *.* TO 'repl'@'%' WITH GRANT OPTION; + CREATE USER IF NOT EXISTS 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}'; + GRANT ALL ON *.* TO 'root'@'%' WITH GRANT OPTION; + FLUSH PRIVILEGES; + SET SQL_LOG_BIN=1; + " + else + log "INFO" "Replication user exists. Updating password if changed..." 
+ retry 120 ${mysql} -N -e " + SET SQL_LOG_BIN=0; + ALTER USER 'repl'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}'; + ALTER USER IF EXISTS 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}'; + FLUSH PRIVILEGES; + SET SQL_LOG_BIN=1; + " fi # retry 120 ${mysql} -N -e "CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='$MYSQL_ROOT_PASSWORD' FOR CHANNEL 'group_replication_recovery';" touch /scripts/ready.txt @@ -125,7 +135,7 @@ already_configured=0 function configure_instance() { log "INFO" "configuring instance $report_host." - local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD}" + local mysqlshell="mysqlsh -u${MYSQL_ROOT_USERNAME} -p${MYSQL_ROOT_PASSWORD}" retry 120 ${mysqlshell} --sql -e "select @@gtid_mode;" gtid=($($mysqlshell --sql -e "select @@gtid_mode;")) @@ -135,22 +145,17 @@ function configure_instance() { return fi - retry 30 ${mysqlshell} -e "dba.configureInstance('${replication_user}@${report_host}',{password:'${MYSQL_ROOT_PASSWORD}',interactive:false,restart:false});" - #instance need to restart after configuration - # Prevent creation of new process until this one is finished - #https://serverfault.com/questions/477448/mysql-keeps-crashing-innodb-unable-to-lock-ibdata1-error-11 - #The most common cause of this problem is trying to start MySQL when it is already running. 
+ yes | ${mysqlshell} -e "dba.configureInstance('${MYSQL_ROOT_USERNAME}:${MYSQL_ROOT_PASSWORD}@${report_host}:3306',{mycnfPath:'/etc/mysql/my.cnf',restart:false});" - #for non-root users, set the restart flag to false, stop the mysqld process, set restart_required=1 to start the process mysqladmin -u ${MYSQL_ROOT_USERNAME} -hlocalhost -p${MYSQL_ROOT_PASSWORD} --port=3306 shutdown wait $pid restart_required=1 } function create_cluster() { - local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${report_host}" + local mysqlshell="mysqlsh -u${MYSQL_ROOT_USERNAME} -p${MYSQL_ROOT_PASSWORD} -h${report_host}" clusterName=$(echo -n $BASE_NAME | sed 's/-/_/g') - retry 5 $mysqlshell -e "cluster=dba.createCluster('$clusterName',{consistency:'BEFORE_ON_PRIMARY_FAILOVER',manualStartOnBoot:'true'});" + retry 5 $mysqlshell -e "cluster=dba.createCluster('$clusterName',{communicationStack:'MYSQL',manualStartOnBoot:true});" } export primary="" @@ -246,7 +251,7 @@ function rejoin_in_cluster() { export pid function reboot_from_completeOutage() { - local mysqlshell="mysqlsh -u${replication_user} -h${report_host} -p${MYSQL_ROOT_PASSWORD}" + local mysqlshell="mysqlsh -u${MYSQL_ROOT_USERNAME} -h${report_host} -p${MYSQL_ROOT_PASSWORD}" #https://dev.mysql.com/doc/dev/mysqlsh-api-javascript/8.0/classmysqlsh_1_1dba_1_1_dba.html#ac68556e9a8e909423baa47dc3b42aadb #mysql wait for user interaction to remove the unavailable seed from the cluster.. clusterName=$(echo -n $BASE_NAME | sed 's/-/_/g') @@ -257,7 +262,7 @@ function reboot_from_completeOutage() { function start_mysqld_in_background() { log "INFO" "Starting mysql server with 'docker-entrypoint.sh mysqld $args'..." - /entrypoint.sh mysqld --user=root --report-host=$report_host --bind-address=* $args & + docker-entrypoint.sh mysqld --user=root --report-host=$report_host --bind-address=* $args & pid=$! 
log "INFO" "The process id of mysqld is '$pid'" } @@ -271,7 +276,7 @@ configure_instance if [[ "$restart_required" == "1" ]]; then start_mysqld_in_background - wait_for_host_online "repl" "$report_host" "$MYSQL_ROOT_PASSWORD" + wait_for_host_online "${MYSQL_ROOT_USERNAME}" "$report_host" "$MYSQL_ROOT_PASSWORD" fi mysqld_alive=0 @@ -292,7 +297,7 @@ while true; do else echo "need start mysqld and wait_for_mysqld_running" start_mysqld_in_background - wait_for_host_online "repl" "$report_host" "$MYSQL_ROOT_PASSWORD" + wait_for_host_online "${MYSQL_ROOT_USERNAME}" "$report_host" "$MYSQL_ROOT_PASSWORD" fi # wait for the script copied by coordinator @@ -326,7 +331,7 @@ while true; do select_primary join_by_clone start_mysqld_in_background - wait_for_host_online "repl" "$report_host" "$MYSQL_ROOT_PASSWORD" + wait_for_host_online "${MYSQL_ROOT_USERNAME}" "$report_host" "$MYSQL_ROOT_PASSWORD" join_in_cluster fi From b50979feb3cc27a55320a45697ce2617122dcd18 Mon Sep 17 00:00:00 2001 From: SK Ali Arman Date: Fri, 17 Apr 2026 19:35:51 +0600 Subject: [PATCH 2/6] update Signed-off-by: SK Ali Arman --- scripts/run_innodb.sh | 84 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/scripts/run_innodb.sh b/scripts/run_innodb.sh index 33dd1be..66577c8 100755 --- a/scripts/run_innodb.sh +++ b/scripts/run_innodb.sh @@ -1,5 +1,24 @@ #!/usr/bin/env bash #set -x +# run_innodb.sh — MySQL InnoDB Cluster init script +# Compatibility: MySQL 8.0.x + +env | sort | grep "POD\|HOST\|NAME" +RECOVERY_DONE_FILE="/tmp/recovery.done" +if [[ "$PITR_RESTORE" == "true" ]]; then + while true; do + sleep 2 + echo "Point In Time Recovery In Progress. Waiting for $RECOVERY_DONE_FILE file" + if [[ -e "$RECOVERY_DONE_FILE" ]]; then + echo "$RECOVERY_DONE_FILE found." 
+ break + fi + done +fi + +if [[ -e "$RECOVERY_DONE_FILE" ]]; then + rm $RECOVERY_DONE_FILE +fi function timestamp() { date +"%Y/%m/%d %T" @@ -46,6 +65,7 @@ cat >>/etc/mysql/default.d/my.cnf </dev/null + log "INFO" "Set super_read_only=ON to prevent errant GTIDs" } +# mysql client shorthand — always use root for local operations +mysql_local="mysql -u${MYSQL_ROOT_USERNAME} -hlocalhost -p${MYSQL_ROOT_PASSWORD} --port=3306" +replication_user=repl + function create_replication_user() { - # MySql server's need a replication user to communicate with each other - # 01. official doc (section from 17.2.1.3 to 17.2.1.5): https://dev.mysql.com/doc/refman/5.7/en/group-replication-user-credentials.html - # 02. https://dev.mysql.com/doc/refman/8.0/en/group-replication-secure-user.html - # 03. repl user permissions: https://www.sqlshack.com/deploy-mysql-innodb-clusters-for-high-availability/ - # 04. digitalocean doc: https://www.digitalocean.com/community/tutorials/how-to-configure-mysql-group-replication-on-ubuntu-16-04 log "INFO" "Checking whether replication user exist or not..." local mysql="mysql -u ${MYSQL_ROOT_USERNAME} -hlocalhost -p${MYSQL_ROOT_PASSWORD} --port=3306" @@ -104,6 +133,8 @@ function create_replication_user() { log "INFO" "Replication user not found. Creating new replication user..." retry 120 ${mysql} -N -e " SET SQL_LOG_BIN=0; + SET GLOBAL super_read_only=OFF; + SET GLOBAL read_only=OFF; CREATE USER IF NOT EXISTS 'repl'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}' REQUIRE SSL; GRANT CREATE USER, FILE, PROCESS, RELOAD, REPLICATION CLIENT, REPLICATION SLAVE, SELECT, SHUTDOWN, SUPER ON *.* TO 'repl'@'%' WITH GRANT OPTION; GRANT DELETE, INSERT, UPDATE ON mysql.* TO 'repl'@'%' WITH GRANT OPTION; @@ -120,13 +151,16 @@ function create_replication_user() { log "INFO" "Replication user exists. Updating password if changed..." 
retry 120 ${mysql} -N -e " SET SQL_LOG_BIN=0; + SET GLOBAL super_read_only=OFF; + SET GLOBAL read_only=OFF; ALTER USER 'repl'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}'; ALTER USER IF EXISTS 'root'@'%' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}'; FLUSH PRIVILEGES; SET SQL_LOG_BIN=1; " fi - # retry 120 ${mysql} -N -e "CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='$MYSQL_ROOT_PASSWORD' FOR CHANNEL 'group_replication_recovery';" + # Re-enable read_only after user creation + ${mysql} -N -e "SET GLOBAL read_only=ON; SET GLOBAL super_read_only=ON;" 2>/dev/null touch /scripts/ready.txt } @@ -180,7 +214,7 @@ already_in_cluster=0 function is_already_in_cluster() { local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" - ${mysqlshell} -e "cluster = dba.getCluster(); cluster.rescan({addInstances:['${report_host}:3306'],interactive:false})" + ${mysqlshell} -e "cluster = dba.getCluster(); cluster.rescan()" out=($(${mysqlshell} --sql -e "SELECT member_host FROM performance_schema.replication_group_members where member_state='ONLINE';")) for host in ${out[@]}; do @@ -224,7 +258,7 @@ check_instance_joined_in_cluster() { function make_sure_instance_join_in_cluster() { local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" - retry 10 ${mysqlshell} -e "cluster = dba.getCluster(); cluster.rescan({addInstances:['${report_host}:3306'],interactive:false})" + retry 10 ${mysqlshell} -e "cluster = dba.getCluster(); cluster.rescan()" } function rejoin_in_cluster() { @@ -255,6 +289,19 @@ function reboot_from_completeOutage() { #https://dev.mysql.com/doc/dev/mysqlsh-api-javascript/8.0/classmysqlsh_1_1dba_1_1_dba.html#ac68556e9a8e909423baa47dc3b42aadb #mysql wait for user interaction to remove the unavailable seed from the cluster.. clusterName=$(echo -n $BASE_NAME | sed 's/-/_/g') + + # Stop GR on any peer stuck in ERROR state before reboot. 
+ # dba.rebootClusterFromCompleteOutage() refuses to proceed if any peer has GR + # in ERROR state ("belongs to a GR group that is not managed as an InnoDB Cluster"). + for host in "${peers[@]}"; do + peer_state=$(mysql -u${MYSQL_ROOT_USERNAME} -h${host} -p${MYSQL_ROOT_PASSWORD} --port=3306 -N -e \ + "SELECT MEMBER_STATE FROM performance_schema.replication_group_members LIMIT 1;" 2>/dev/null) + if [[ "$peer_state" == "ERROR" ]]; then + log "INFO" "Stopping GR on $host (stuck in ERROR state) before cluster reboot..." + mysql -u${MYSQL_ROOT_USERNAME} -h${host} -p${MYSQL_ROOT_PASSWORD} --port=3306 -N -e "STOP GROUP_REPLICATION;" 2>/dev/null + fi + done + yes | $mysqlshell -e "dba.rebootClusterFromCompleteOutage('$clusterName',{force:'true'})" yes | $mysqlshell -e "cluster = dba.getCluster(); cluster.rescan()" wait $pid @@ -267,8 +314,6 @@ function start_mysqld_in_background() { log "INFO" "The process id of mysqld is '$pid'" } -replication_user=repl - start_mysqld_in_background wait_for_host_online "root" "localhost" "$MYSQL_ROOT_PASSWORD" create_replication_user @@ -300,12 +345,25 @@ while true; do wait_for_host_online "${MYSQL_ROOT_USERNAME}" "$report_host" "$MYSQL_ROOT_PASSWORD" fi - # wait for the script copied by coordinator + # wait for the signal file from coordinator + # Also check if this node is already ONLINE in GR — this happens when + # another pod's coordinator called rebootClusterFromCompleteOutage() which + # rejoins all members remotely via mysqlsh AdminAPI, bypassing this script. while [ ! -f "/scripts/signal.txt" ]; do + member_state=$(mysql -u${MYSQL_ROOT_USERNAME} -hlocalhost -p${MYSQL_ROOT_PASSWORD} -N -e \ + "SELECT MEMBER_STATE FROM performance_schema.replication_group_members WHERE MEMBER_HOST='${report_host}' LIMIT 1;" 2>/dev/null) + if [[ "$member_state" == "ONLINE" ]]; then + log "INFO" "Already ONLINE in GR group (joined by another node's reboot) — skipping signal wait" + break + fi log "WARNING" "signal is not present yet!" 
sleep 1 done + # If we broke out because GR is already ONLINE (no signal file), skip to wait. + if [ ! -f "/scripts/signal.txt" ]; then + log "INFO" "No signal to execute — node already joined via external reboot" + else desired_func=$(cat /scripts/signal.txt) rm -rf /scripts/signal.txt log "INFO" "going to execute $desired_func" @@ -338,7 +396,9 @@ while true; do if [[ $desired_func == "reboot_from_complete_outage" ]]; then reboot_from_completeOutage fi - log "INFO" "waiting for mysql process id = $pid" + fi + + log "INFO" "waiting for mysql process id = $pid" wait $pid rm -rf /scripts/signal.txt From 930dfb0756734f623009eecd2a9d0265dc87928c Mon Sep 17 00:00:00 2001 From: SK Ali Arman Date: Mon, 20 Apr 2026 12:10:51 +0600 Subject: [PATCH 3/6] fix auth rotate Signed-off-by: SK Ali Arman --- scripts/run_innodb.sh | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/scripts/run_innodb.sh b/scripts/run_innodb.sh index 66577c8..a6d23cf 100755 --- a/scripts/run_innodb.sh +++ b/scripts/run_innodb.sh @@ -121,6 +121,33 @@ function wait_for_host_online() { mysql_local="mysql -u${MYSQL_ROOT_USERNAME} -hlocalhost -p${MYSQL_ROOT_PASSWORD} --port=3306" replication_user=repl +# Kill any stale mysqlsh AdminAPI session holding the cluster-wide EXCLUSIVE lock +# on $1 (usually the primary). A session holding AdminAPI_lock while in Sleep +# state means a previous mysqlsh call died without releasing — rescan/addInstance/ +# rejoinInstance will hang with MYSQLSH 51500. Legitimate in-flight AdminAPI ops +# are always in Query state, never Sleep. Kill Sleep>5s holders to auto-recover. 
+function clear_stale_cluster_lock() { + local target_host=$1 + local mysql_root="mysql -u${MYSQL_ROOT_USERNAME} -h${target_host} -p${MYSQL_ROOT_PASSWORD} --port=3306 -N" + local stuck_ids + stuck_ids=$(${mysql_root} -e " + SELECT t.PROCESSLIST_ID + FROM performance_schema.metadata_locks m + JOIN performance_schema.threads t ON m.OWNER_THREAD_ID = t.THREAD_ID + WHERE m.OBJECT_SCHEMA='AdminAPI_cluster' + AND m.OBJECT_NAME='AdminAPI_lock' + AND m.LOCK_TYPE='EXCLUSIVE' + AND t.PROCESSLIST_COMMAND='Sleep' + AND t.PROCESSLIST_TIME > 5;" 2>/dev/null | awk 'NF') + if [[ -n "$stuck_ids" ]]; then + for stuck_id in $stuck_ids; do + log "WARNING" "Killing stale AdminAPI_lock holder on ${target_host} (conn=${stuck_id}, Sleep>5s)" + ${mysql_root} -e "KILL ${stuck_id};" 2>/dev/null + done + sleep 2 + fi +} + function create_replication_user() { log "INFO" "Checking whether replication user exist or not..." local mysql="mysql -u ${MYSQL_ROOT_USERNAME} -hlocalhost -p${MYSQL_ROOT_PASSWORD} --port=3306" @@ -214,6 +241,7 @@ already_in_cluster=0 function is_already_in_cluster() { local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" + clear_stale_cluster_lock "${primary}" ${mysqlshell} -e "cluster = dba.getCluster(); cluster.rescan()" out=($(${mysqlshell} --sql -e "SELECT member_host FROM performance_schema.replication_group_members where member_state='ONLINE';")) @@ -229,13 +257,16 @@ function is_already_in_cluster() { function join_in_cluster() { log "INFO " "$report_host joining in cluster" local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" + clear_stale_cluster_lock "${primary}" retry 10 ${mysqlshell} -e "cluster = dba.getCluster();cluster.addInstance('${replication_user}@${report_host}',{recoveryMethod:'incremental'});" } function join_by_clone() { log "INFO " "$report_host joining in cluster" local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" + clear_stale_cluster_lock 
"${primary}" retry 10 ${mysqlshell} -e "cluster = dba.getCluster();cluster.removeInstance('$report_host',{force:'true'});" + clear_stale_cluster_lock "${primary}" retry 10 ${mysqlshell} -e "cluster = dba.getCluster(); cluster.addInstance('${replication_user}@${report_host}',{recoveryMethod:'clone'});" #this is required for clone method @@ -258,11 +289,13 @@ check_instance_joined_in_cluster() { function make_sure_instance_join_in_cluster() { local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" + clear_stale_cluster_lock "${primary}" retry 10 ${mysqlshell} -e "cluster = dba.getCluster(); cluster.rescan()" } function rejoin_in_cluster() { local mysqlshell="mysqlsh -u${replication_user} -p${MYSQL_ROOT_PASSWORD} -h${primary}" + clear_stale_cluster_lock "${primary}" ${mysqlshell} -e "cluster=dba.getCluster(); cluster.rejoinInstance('${replication_user}@${report_host}')" out=($(${mysqlshell} --sql -e "SELECT member_host FROM performance_schema.replication_group_members;")) @@ -277,6 +310,7 @@ function rejoin_in_cluster() { fi check_instance_joined_in_cluster if [[ "$joined_in_cluster" == "0" ]]; then + clear_stale_cluster_lock "${primary}" retry 1 ${mysqlshell} -e "cluster = dba.getCluster();cluster.removeInstance('$report_host',{force:'true'});" join_in_cluster fi @@ -303,6 +337,7 @@ function reboot_from_completeOutage() { done yes | $mysqlshell -e "dba.rebootClusterFromCompleteOutage('$clusterName',{force:'true'})" + clear_stale_cluster_lock "${report_host}" yes | $mysqlshell -e "cluster = dba.getCluster(); cluster.rescan()" wait $pid } From 37314d82a15434c079cf006e1a555dbcb461e938 Mon Sep 17 00:00:00 2001 From: SK Ali Arman Date: Tue, 28 Apr 2026 17:04:36 +0600 Subject: [PATCH 4/6] add mysqld starting indicator Signed-off-by: SK Ali Arman --- scripts/run.sh | 24 +++++++++++++++++------- scripts/run_innodb.sh | 5 +++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/scripts/run.sh b/scripts/run.sh index 5b84d85..870594d 
100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -15,6 +15,7 @@ # POD_IP_TYPE = Address type of POD_IP (one of IPV4, IPv6) env | sort | grep "POD\|HOST\|NAME" +echo "running">/scripts/setup.txt RECOVERY_DONE_FILE="/tmp/recovery.done" if [[ "$PITR_RESTORE" == "true" ]]; then while true; do @@ -181,23 +182,31 @@ EOL # wait for mysql daemon be running (alive) function wait_for_mysqld_running() { local mysql="$mysql_header --host=$localhost" + local max_restarts=60 + local restarts=0 - for i in {900..0}; do + while true; do + if ! kill -0 "$pid" 2>/dev/null; then + if (( restarts >= max_restarts )); then + log "ERROR" "mysqld (pid=$pid) died and exceeded $max_restarts restart attempts. Aborting." + exit 1 + fi + restarts=$((restarts + 1)) + log "ERROR" "mysqld (pid=$pid) is no longer running. Restart attempt $restarts/$max_restarts..." + start_mysqld_in_background + sleep 10 + continue + fi out=$(${mysql} -N -e "select 1;" 2>/dev/null) log "INFO" "Attempt $i: Pinging '$report_host' has returned: '$out'...................................." if [[ "$out" == "1" ]]; then break fi - + log "INFO" "Pinging '$report_host' has returned: '$out' (pid=$pid alive, restarts=$restarts)" echo -n . sleep 1 done - if [[ "$i" == "0" ]]; then - echo "" - log "ERROR" "Server ${report_host} failed to start in 900 seconds............." - exit 1 - fi log "INFO" "mysql daemon is ready to use......." 
# Set read-only immediately after MySQL starts to prevent any external @@ -633,5 +642,6 @@ while true; do fi joining_for_first_time=0 log "INFO" "waiting for mysql process id = $pid" + rm -rf /scripts/setup.txt wait $pid done diff --git a/scripts/run_innodb.sh b/scripts/run_innodb.sh index a6d23cf..34ae1e4 100755 --- a/scripts/run_innodb.sh +++ b/scripts/run_innodb.sh @@ -4,6 +4,7 @@ # Compatibility: MySQL 8.0.x env | sort | grep "POD\|HOST\|NAME" +echo "running">/scripts/setup.txt RECOVERY_DONE_FILE="/tmp/recovery.done" if [[ "$PITR_RESTORE" == "true" ]]; then while true; do @@ -434,7 +435,7 @@ while true; do fi log "INFO" "waiting for mysql process id = $pid" - wait $pid rm -rf /scripts/signal.txt - + rm -rf /scripts/setup.txt + wait $pid done From 8d4e84d29ba00d0ddd0a9eea48e6eb17d7023eca Mon Sep 17 00:00:00 2001 From: SK Ali Arman Date: Tue, 28 Apr 2026 17:09:47 +0600 Subject: [PATCH 5/6] add mysqld starting indicator for innodb Signed-off-by: SK Ali Arman --- scripts/run_innodb.sh | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/scripts/run_innodb.sh b/scripts/run_innodb.sh index 34ae1e4..ab03d62 100755 --- a/scripts/run_innodb.sh +++ b/scripts/run_innodb.sh @@ -103,13 +103,32 @@ function wait_for_host_online() { log "INFO" "checking for host $2 to come online" local mysqlshell="mysql -u$1 -h$2 -p$3" # "mysql -uroot -ppass -hmysql-server-0.mysql-server.default.svc" - retry 900 ${mysqlshell} -e "select 1;" | awk '{print$1}' - out=$(${mysqlshell} -e "select 1;" | head -n1 | awk '{print$1}') - if [[ "$out" == "1" ]]; then - log "INFO" "host $2 is online" - else - log "INFO" "server failed to comes online within 900 seconds" - fi + local max_restarts=60 + local restarts=0 + + while true; do + if ! kill -0 "$pid" 2>/dev/null; then + if (( restarts >= max_restarts )); then + log "ERROR" "mysqld (pid=$pid) died and exceeded $max_restarts restart attempts. Aborting." 
+ exit 1 + fi + restarts=$((restarts + 1)) + log "ERROR" "mysqld (pid=$pid) is no longer running. Restart attempt $restarts/$max_restarts..." + start_mysqld_in_background + sleep 10 + continue + fi + out=$(${mysqlshell} -e "select 1;" | head -n1 | awk '{print$1}') + log "INFO" "Attempt $i: Pinging '$report_host' has returned: '$out'...................................." + if [[ "$out" == "1" ]]; then + break + fi + log "INFO" "Pinging '$report_host' has returned: '$out' (pid=$pid alive, restarts=$restarts)" + echo -n . + sleep 1 + done + + log "INFO" "mysql daemon is ready to use......." # Set read-only immediately after MySQL starts to prevent any external # process from writing local GTIDs before the node joins the cluster. From 025afbd236d74555b40e066af108baa05a590135 Mon Sep 17 00:00:00 2001 From: SK Ali Arman Date: Thu, 30 Apr 2026 13:34:41 +0600 Subject: [PATCH 6/6] update setup.txt creation Signed-off-by: SK Ali Arman --- scripts/run.sh | 5 +++- scripts/run_innodb.sh | 59 +++++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/scripts/run.sh b/scripts/run.sh index 870594d..bf269d4 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -606,6 +606,8 @@ install_group_replication_plugin install_clone_plugin while true; do + echo "running">/scripts/setup.txt + log "INFO" "creating setup.txt file" kill -0 $pid exit="$?" 
if [[ "$exit" == "0" ]]; then @@ -641,7 +643,8 @@ while true; do join_by_clone fi joining_for_first_time=0 - log "INFO" "waiting for mysql process id = $pid" + log "INFO" "removing setup.txt file" rm -rf /scripts/setup.txt + log "INFO" "waiting for mysql process id = $pid" wait $pid done diff --git a/scripts/run_innodb.sh b/scripts/run_innodb.sh index ab03d62..ee4bf2f 100755 --- a/scripts/run_innodb.sh +++ b/scripts/run_innodb.sh @@ -391,6 +391,8 @@ function check_mysqld_alive() { } while true; do + echo "running">/scripts/setup.txt + log "INFO" "creating setup.txt file" check_mysqld_alive if [[ "$mysqld_alive" == "1" ]]; then echo "mysqld process is running" @@ -419,42 +421,43 @@ while true; do if [ ! -f "/scripts/signal.txt" ]; then log "INFO" "No signal to execute — node already joined via external reboot" else - desired_func=$(cat /scripts/signal.txt) - rm -rf /scripts/signal.txt - log "INFO" "going to execute $desired_func" + desired_func=$(cat /scripts/signal.txt) + rm -rf /scripts/signal.txt + log "INFO" "going to execute $desired_func" - if [[ $desired_func == "create_cluster" ]]; then - create_cluster - fi + if [[ $desired_func == "create_cluster" ]]; then + create_cluster + fi - if [[ $desired_func == "join_in_cluster" ]]; then - select_primary - join_in_cluster - check_instance_joined_in_cluster - if [[ "$joined_in_cluster" == "0" ]]; then - make_sure_instance_join_in_cluster + if [[ $desired_func == "join_in_cluster" ]]; then + select_primary + join_in_cluster + check_instance_joined_in_cluster + if [[ "$joined_in_cluster" == "0" ]]; then + make_sure_instance_join_in_cluster + fi fi - fi - if [[ $desired_func == "rejoin_in_cluster" ]]; then - select_primary - rejoin_in_cluster - fi - if [[ $desired_func == "join_by_clone" ]]; then - select_primary - join_by_clone - start_mysqld_in_background - wait_for_host_online "${MYSQL_ROOT_USERNAME}" "$report_host" "$MYSQL_ROOT_PASSWORD" - join_in_cluster - fi + if [[ $desired_func == "rejoin_in_cluster" ]]; then + 
select_primary + rejoin_in_cluster + fi + if [[ $desired_func == "join_by_clone" ]]; then + select_primary + join_by_clone + start_mysqld_in_background + wait_for_host_online "${MYSQL_ROOT_USERNAME}" "$report_host" "$MYSQL_ROOT_PASSWORD" + join_in_cluster + fi - if [[ $desired_func == "reboot_from_complete_outage" ]]; then - reboot_from_completeOutage - fi + if [[ $desired_func == "reboot_from_complete_outage" ]]; then + reboot_from_completeOutage + fi fi - log "INFO" "waiting for mysql process id = $pid" + log "INFO" "removing setup.txt file" rm -rf /scripts/signal.txt rm -rf /scripts/setup.txt + log "INFO" "waiting for mysql process id = $pid" wait $pid done