From b6d903afb021acc69e5ae25bdce60de2048478fd Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Sat, 10 May 2025 20:00:35 +0530 Subject: [PATCH 01/10] Starter code for MM stacking --- .../java/org/apache/helix/HelixAdmin.java | 10 + .../apache/helix/manager/zk/ZKHelixAdmin.java | 103 +++- .../apache/helix/model/ControllerHistory.java | 9 +- .../apache/helix/model/MaintenanceSignal.java | 365 ++++++++++++++ .../TestClusterMaintenanceMode.java | 461 ++++++++++++++++++ .../org/apache/helix/mock/MockHelixAdmin.java | 6 + .../resources/helix/ClusterAccessor.java | 19 +- 7 files changed, 942 insertions(+), 31 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java index 5c2ef10f20..f0cad61bac 100644 --- a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java @@ -422,6 +422,16 @@ void autoEnableMaintenanceMode(String clusterName, boolean enabled, String reaso void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String reason, Map customFields); + /** + * Enable maintenance mode via automation systems (like HelixACM). To be called by automation services. + * @param clusterName + * @param enabled + * @param reason + * @param customFields user-specified KV mappings to be stored in the ZNode + */ + void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason, + Map customFields); + /** * Check specific cluster is in maintenance mode or not * @param clusterName the cluster name diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index 0d72ac4aaa..b2e53b12bc 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -948,8 +948,15 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) { public boolean isInMaintenanceMode(String clusterName) { HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); - return accessor.getBaseDataAccessor() - .exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT); + + MaintenanceSignal signal = accessor.getProperty(keyBuilder.maintenance()); + + if (signal == null) { + return false; + } + + // Check if there are any maintenance reasons active + return signal.hasMaintenanceReasons(); } @Override @@ -1182,6 +1189,14 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S MaintenanceSignal.TriggeringEntity.USER); } + @Override + public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason, + Map customFields) { + processMaintenanceMode(clusterName, enabled, reason, + MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE, customFields, + MaintenanceSignal.TriggeringEntity.AUTOMATION); + } + /** * Helper method for enabling/disabling maintenance mode. * @param clusterName @@ -1201,38 +1216,75 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically" : "manually", enabled ? "enters" : "exits", reason == null ? "NULL" : reason); final long currentTime = System.currentTimeMillis(); + + MaintenanceSignal maintenanceSignal = accessor.getProperty(keyBuilder.maintenance()); if (!enabled) { - // Exit maintenance mode - accessor.removeProperty(keyBuilder.maintenance()); + // Exit maintenance mode for this specific triggering entity + + if (maintenanceSignal != null) { + // If a specific actor is exiting maintenance mode + boolean removed = maintenanceSignal.removeMaintenanceReason(triggeringEntity); + + if (removed) { + // If there are still reasons for maintenance mode, update the ZNode + if (maintenanceSignal.hasMaintenanceReasons()) { + if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { + throw new HelixException("Failed to update maintenance signal!"); + } + } else { + // If this was the last reason, remove the maintenance ZNode entirely + accessor.removeProperty(keyBuilder.maintenance()); + } + } else { + // If old client, just remove the maintenance node entirely + accessor.removeProperty(keyBuilder.maintenance()); + } + } } else { // Enter maintenance mode - MaintenanceSignal maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); + + if (maintenanceSignal == null) { + // Create a new maintenance signal if it doesn't exist + maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); + } + + // First check for potential old client updates (simpleFields different than listField entries) + // This MUST happen before we modify any simpleFields to avoid overwriting important data needed for reconciliation + maintenanceSignal.reconcileMaintenanceData(); + + // Add the reason to the maintenance signal if (reason != null) { maintenanceSignal.setReason(reason); } + maintenanceSignal.setTimestamp(currentTime); maintenanceSignal.setTriggeringEntity(triggeringEntity); - switch (triggeringEntity) { - case CONTROLLER: - // autoEnable - maintenanceSignal.setAutoTriggerReason(internalReason); - break; - case USER: - case UNKNOWN: - // manuallyEnable - if (customFields != null && !customFields.isEmpty()) { - // Enter all custom fields provided by the user - Map simpleFields = maintenanceSignal.getRecord().getSimpleFields(); - for (Map.Entry entry : customFields.entrySet()) { - if (!simpleFields.containsKey(entry.getKey())) { - simpleFields.put(entry.getKey(), entry.getValue()); - } - } + + // For CONTROLLER type, set the auto trigger reason + if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) { + maintenanceSignal.setAutoTriggerReason(internalReason); + } else if (customFields != null && !customFields.isEmpty()) { + // For other types, add custom fields if provided + Map simpleFields = maintenanceSignal.getRecord().getSimpleFields(); + for (Map.Entry entry : customFields.entrySet()) { + if (!simpleFields.containsKey(entry.getKey())) { + simpleFields.put(entry.getKey(), entry.getValue()); } - break; + } } - if (!accessor.createMaintenance(maintenanceSignal)) { - throw new HelixException("Failed to create maintenance signal!"); + + // Add this reason to the multi-actor maintenance reasons list + maintenanceSignal.addMaintenanceReason(reason, currentTime, triggeringEntity); + + // Create or update the maintenance node + if (accessor.getProperty(keyBuilder.maintenance()) == null) { + if (!accessor.createMaintenance(maintenanceSignal)) { + throw new HelixException("Failed to create maintenance signal!"); + } + } else { + if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { + throw new HelixException("Failed to update maintenance signal!"); + } } } @@ -1246,7 +1298,8 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, } return new ControllerHistory(oldRecord) .updateMaintenanceHistory(enabled, reason, currentTime, internalReason, - customFields, triggeringEntity); + customFields, triggeringEntity, + isInMaintenanceMode(clusterName)); } catch (IOException e) { logger.error("Failed to update maintenance history! Exception: {}", e); return oldRecord; diff --git a/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java b/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java index 47b0958d9c..89289c3fa5 100644 --- a/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java +++ b/helix-core/src/main/java/org/apache/helix/model/ControllerHistory.java @@ -56,8 +56,8 @@ private enum MaintenanceConfigKey { MAINTENANCE_HISTORY, OPERATION_TYPE, DATE, - REASON - + REASON, + IN_MAINTENANCE_AFTER_OPERATION } private enum ManagementModeConfigKey { @@ -180,10 +180,11 @@ public ZNRecord updateManagementModeHistory(String controller, ClusterManagement * @param internalReason * @param customFields * @param triggeringEntity + * @param inMaintenanceAfterOperation whether the cluster is still in maintenance mode after this operation */ public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long currentTime, MaintenanceSignal.AutoTriggerReason internalReason, Map customFields, - MaintenanceSignal.TriggeringEntity triggeringEntity) throws IOException { + MaintenanceSignal.TriggeringEntity triggeringEntity, boolean inMaintenanceAfterOperation) throws IOException { DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:" + "mm:ss"); df.setTimeZone(TimeZone.getTimeZone("UTC")); String dateTime = df.format(new Date(currentTime)); @@ -198,6 +199,8 @@ public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long cu String.valueOf(currentTime)); maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), triggeringEntity.name()); + maintenanceEntry.put(MaintenanceConfigKey.IN_MAINTENANCE_AFTER_OPERATION.name(), + String.valueOf(inMaintenanceAfterOperation)); if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) { // If auto-triggered maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.AUTO_TRIGGER_REASON.name(), diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index 83e0e1c604..020ee5c430 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -19,12 +19,23 @@ * under the License. */ +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Iterator; import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.type.TypeFactory; +import java.io.IOException; /** * A ZNode that signals that the cluster is in maintenance mode. */ public class MaintenanceSignal extends PauseSignal { + private static final Logger LOG = LoggerFactory.getLogger(MaintenanceSignal.class); /** * Pre-defined fields set by Helix Controller only. @@ -40,6 +51,7 @@ public enum MaintenanceSignalProperty { */ public enum TriggeringEntity { CONTROLLER, + AUTOMATION, // triggered by automation systems (like HelixACM) USER, // manually triggered by user UNKNOWN } @@ -57,6 +69,11 @@ public enum AutoTriggerReason { NOT_APPLICABLE // Not triggered automatically or automatically exiting maintenance mode } + /** + * Constant for the name of the reasons list field + */ + private static final String REASONS_LIST_FIELD = "reasons"; + public MaintenanceSignal(String id) { super(id); } @@ -112,4 +129,352 @@ public void setTimestamp(long timestamp) { public long getTimestamp() { return _record.getLongField(MaintenanceSignalProperty.TIMESTAMP.name(), -1); } + + /** + * Add a new maintenance reason (or update an existing one if the triggering entity already has a reason). + * + * @param reason The reason for maintenance + * @param timestamp The timestamp when maintenance was triggered + * @param triggeringEntity The entity that triggered maintenance + */ + public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity triggeringEntity) { + LOG.info("Adding maintenance reason for entity: {}, reason: {}, timestamp: {}", + triggeringEntity, reason, timestamp); + + // The triggering entity is our unique key - we will overwrite any existing entry with this entity + String triggerEntityStr = triggeringEntity.name(); + + // Get the current list of reasons + List> reasons = getMaintenanceReasons(); + LOG.debug("Before addition: Reasons list contains {} entries", reasons.size()); + + // Check if we already have a reason for this entity + boolean found = false; + for (Map entry : reasons) { + if (triggerEntityStr.equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { + // Update the existing entry + entry.put(PauseSignalProperty.REASON.name(), reason); + entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp)); + found = true; + LOG.debug("Updated existing entry for entity: {}", triggeringEntity); + break; + } + } + + // If we didn't find an entry, add a new one + if (!found) { + Map newEntry = new HashMap<>(); + newEntry.put(PauseSignalProperty.REASON.name(), reason); + newEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp)); + newEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), triggerEntityStr); + reasons.add(newEntry); + LOG.debug("Added new entry for entity: {}", triggeringEntity); + } + + // Update the ZNRecord with our updated list + updateReasonsListField(reasons); + LOG.debug("After addition: Reasons list contains {} entries", reasons.size()); + + // Update all the simpleFields for backward compatibility + setReason(reason); + _record.setSimpleField("reason", reason); // Also update lowercase "reason" for complete compatibility + setTimestamp(timestamp); + setTriggeringEntity(triggeringEntity); + } + + /** + * Helper method to update the ZNRecord with the current reasons list. + * Each reason is stored as a single JSON string in the list. + * + * @param reasons The list of reason maps to store + */ + private void updateReasonsListField(List> reasons) { + List reasonsList = new ArrayList<>(); + + for (Map entry : reasons) { + String jsonString = convertMapToJsonString(entry); + if (!jsonString.isEmpty()) { + reasonsList.add(jsonString); + } + } + + _record.setListField(REASONS_LIST_FIELD, reasonsList); + } + + /** + * Convert a map to a JSON-style string + */ + private String convertMapToJsonString(Map map) { + try { + return new ObjectMapper().writeValueAsString(map); + } catch (IOException e) { + LOG.warn("Failed to convert map to JSON string: {}", e.getMessage()); + // Fallback to a simple representation if JSON serialization fails + return ""; + } + } + + /** + * Get all maintenance reasons currently active. + * + * @return List of maintenance reasons as maps + */ + public List> getMaintenanceReasons() { + List> reasons = new ArrayList<>(); + List reasonsList = _record.getListField(REASONS_LIST_FIELD); + + if (reasonsList == null || reasonsList.isEmpty()) { + // If the list doesn't exist but simple fields do, add the simple fields as the first reason + String simpleReason = getReason(); + if (simpleReason != null) { + Map entry = new HashMap<>(); + entry.put(PauseSignalProperty.REASON.name(), simpleReason); + entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(getTimestamp())); + entry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), getTriggeringEntity().name()); + + reasons.add(entry); + } + } else { + // Parse each entry as a JSON-formatted string + for (String entryStr : reasonsList) { + Map entry = parseJsonStyleEntry(entryStr); + if (!entry.isEmpty()) { + reasons.add(entry); + } + } + } + + return reasons; + } + + /** + * Parse an entry string in JSON format into a map + */ + private Map parseJsonStyleEntry(String entryStr) { + Map map = new HashMap<>(); + try { + return new ObjectMapper().readValue(entryStr, + TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, String.class)); + } catch (IOException e) { + LOG.warn("Failed to parse JSON entry: {}, error: {}", entryStr, e.getMessage()); + } + return map; + } + + /** + * Remove a maintenance reason by triggering entity. + * + * @param triggeringEntity The entity whose reason should be removed + * @return true if a reason was removed, false otherwise + */ + public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { + LOG.info("Removing maintenance reason for entity: {}", triggeringEntity); + + // Get the current list of reasons + List> reasons = getMaintenanceReasons(); + int originalSize = reasons.size(); + + LOG.debug("Before removal: Reasons list contains {} entries", reasons.size()); + + // Create a new list to avoid modifying the original during iteration + List> updatedReasons = new ArrayList<>(); + String targetEntity = triggeringEntity.name(); + + // Only keep reasons that don't match the triggering entity + for (Map entry : reasons) { + String entryEntity = entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()); + if (!targetEntity.equals(entryEntity)) { + updatedReasons.add(entry); + } else { + LOG.debug("Removing entry with reason: {} for entity: {}", + entry.get(PauseSignalProperty.REASON.name()), entryEntity); + } + } + + boolean removed = updatedReasons.size() < originalSize; + LOG.debug("After removal: Reasons list contains {} entries", updatedReasons.size()); + + if (removed) { + // Update the ZNRecord with the new reasons + updateReasonsListField(updatedReasons); + + // Update the simpleFields with the most recent reason (for backward compatibility) + if (!updatedReasons.isEmpty()) { + // Sort by timestamp in descending order to get the most recent + updatedReasons.sort((r1, r2) -> { + long t1 = Long.parseLong(r1.get(MaintenanceSignalProperty.TIMESTAMP.name())); + long t2 = Long.parseLong(r2.get(MaintenanceSignalProperty.TIMESTAMP.name())); + return Long.compare(t2, t1); + }); + + // Update simple fields with the most recent + Map mostRecent = updatedReasons.get(0); + String newReason = mostRecent.get(PauseSignalProperty.REASON.name()); + long newTimestamp = Long.parseLong(mostRecent.get(MaintenanceSignalProperty.TIMESTAMP.name())); + TriggeringEntity newEntity = TriggeringEntity.valueOf( + mostRecent.get(MaintenanceSignalProperty.TRIGGERED_BY.name())); + + LOG.info("Updated to most recent reason: {}, entity: {}, timestamp: {}", + newReason, newEntity, newTimestamp); + + setReason(newReason); + _record.setSimpleField("reason", newReason); // Also update lowercase "reason" for complete compatibility + setTimestamp(newTimestamp); + setTriggeringEntity(newEntity); + } else { + // If no reasons left, clear all fields + LOG.info("No maintenance reasons remaining, clearing all fields"); + setReason(null); + _record.setSimpleField("reason", null); + setTimestamp(System.currentTimeMillis()); + setTriggeringEntity(TriggeringEntity.UNKNOWN); + } + } else { + LOG.info("No matching maintenance reason found for entity: {}", triggeringEntity); + } + + return removed; + } + + /** + * Check if there are any active maintenance reasons. + * + * @return true if there are any reasons for maintenance, false otherwise + */ + public boolean hasMaintenanceReasons() { + return !getMaintenanceReasons().isEmpty(); + } + + /** + * Update the maintenance reason list to ensure all data is consistent. + * This is used when handling potential inconsistencies between simpleFields and + * the reasons list, which can happen with old clients. + * + * @return true if list was updated, false if no change was needed + */ + public boolean reconcileMaintenanceData() { + // Get the reason from simple fields + String simpleReason = getReason(); + + // If simpleFields are null or empty, nothing to reconcile + if (simpleReason == null) { + return false; + } + long simpleTimestamp = getTimestamp(); + TriggeringEntity simpleTriggeringEntity = getTriggeringEntity(); + + // Look at the raw reasons list - don't use getMaintenanceReasons() as it already + // synthesizes entries from simple fields which would create a circular dependency + List rawReasonsList = _record.getListField(REASONS_LIST_FIELD); + List> parsedReasons = new ArrayList<>(); + + // If there's no list field at all, we'll need to create one + boolean needsUpdate = (rawReasonsList == null); + + if (rawReasonsList != null) { + // Parse each entry to check if our simple fields already have a corresponding entry + for (String entryStr : rawReasonsList) { + Map entry = parseJsonStyleEntry(entryStr); + if (!entry.isEmpty()) { + parsedReasons.add(entry); + } + } + } else { + // Create a new list + rawReasonsList = new ArrayList<>(); + } + + // Check if the triggering entity from simple fields already has an entry + boolean alreadyPresent = false; + for (Map entry : parsedReasons) { + if (simpleTriggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { + // Check if timestamps are different + long entryTimestamp = Long.parseLong(entry.get(MaintenanceSignalProperty.TIMESTAMP.name())); + if (simpleTimestamp > entryTimestamp) { + // If simple field timestamp is newer, update the entry + entry.put(PauseSignalProperty.REASON.name(), simpleReason); + entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(simpleTimestamp)); + needsUpdate = true; + } + alreadyPresent = true; + break; + } + } + + // If simple fields exist but not in the reasons list, add them + if (!alreadyPresent) { + Map newEntry = new HashMap<>(); + newEntry.put(PauseSignalProperty.REASON.name(), simpleReason); + newEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(simpleTimestamp)); + newEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), simpleTriggeringEntity.name()); + + parsedReasons.add(newEntry); + needsUpdate = true; + + LOG.debug("Added missing reason for {} to reasons list during reconciliation", + simpleTriggeringEntity); + } + + if (needsUpdate) { + // Update the ZNRecord with the new reasons + updateReasonsListField(parsedReasons); + LOG.debug("Updated reasons list after reconciliation, now contains {} entries", + parsedReasons.size()); + return true; + } + + return false; + } + + /** + * Checks if there is a maintenance reason from a specific triggering entity. + * + * @param triggeringEntity The entity to check + * @return true if there is a maintenance reason from this entity + */ + public boolean hasMaintenanceReason(TriggeringEntity triggeringEntity) { + List> reasons = getMaintenanceReasons(); + for (Map entry : reasons) { + if (triggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { + return true; + } + } + return false; + } + + /** + * Gets the maintenance reason details for a specific triggering entity. + * + * @param triggeringEntity The entity to get reason details for + * @return Map containing reason details, or null if not found + */ + public Map getMaintenanceReasonDetails(TriggeringEntity triggeringEntity) { + List> reasons = getMaintenanceReasons(); + for (Map entry : reasons) { + if (triggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { + return entry; + } + } + return null; + } + + /** + * Gets the number of active maintenance reasons. + * + * @return The count of active maintenance reasons + */ + public int getMaintenanceReasonsCount() { + return getMaintenanceReasons().size(); + } + + /** + * Gets the maintenance reason from a specific triggering entity. + * + * @param triggeringEntity The entity to get reason for + * @return The reason string, or null if not found + */ + public String getMaintenanceReason(TriggeringEntity triggeringEntity) { + Map details = getMaintenanceReasonDetails(triggeringEntity); + return details != null ? details.get(PauseSignalProperty.REASON.name()) : null; + } } diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index 6654098f8b..d7c0b56e2d 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -20,7 +20,9 @@ */ import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import javax.management.MalformedObjectNameException; import javax.management.ObjectName; @@ -40,11 +42,13 @@ import org.apache.helix.model.ExternalView; import org.apache.helix.model.IdealState; import org.apache.helix.model.MaintenanceSignal; +import org.apache.helix.model.PauseSignal; import org.apache.helix.monitoring.mbeans.MonitorDomainNames; import org.testng.Assert; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import org.apache.helix.zookeeper.datamodel.ZNRecord; import static org.apache.helix.monitoring.mbeans.ClusterStatusMonitor.CLUSTER_DN_KEY; @@ -439,4 +443,461 @@ private static Map convertStringToMap(String value) throws IOExc return new ObjectMapper().readValue(value, TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, String.class)); } + + /** + * Test that automation triggered maintenance mode works correctly + * and multi-actor maintenance mode requires all actors to exit + */ +// @Test(dependsOnMethods = "testMaintenanceHistory") + @Test + public void testAutomationMaintenanceMode() throws Exception { + // Make sure we're not in maintenance mode at the start + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Put cluster in maintenance mode via automation + Map customFields = ImmutableMap.of("TICKET", "AUTO-123", "SOURCE", "HelixACM"); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "Automation maintenance", customFields); + + // Verify we are in maintenance mode with the right attributes + MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(maintenanceSignal); + Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), + MaintenanceSignal.TriggeringEntity.AUTOMATION); + + // Verify custom fields were set + for (Map.Entry entry : customFields.entrySet()) { + Assert.assertEquals(maintenanceSignal.getRecord().getSimpleField(entry.getKey()), + entry.getValue()); + } + + // Manually put the cluster in maintenance too - this should keep both reasons + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "User maintenance", null); + + // Verify we have both reasons + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertTrue(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertTrue(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertEquals(maintenanceSignal.getMaintenanceReasonsCount(), 2); + + // Exit maintenance mode for USER only + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify we're still in maintenance mode with only AUTOMATION reason + maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(maintenanceSignal); + Assert.assertFalse(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(maintenanceSignal.getMaintenanceReasonsCount(), 1); + + // Exit maintenance mode for AUTOMATION + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify we're now out of maintenance mode + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Test that removing a maintenance reason doesn't add duplicate entries in the reasons list + */ +// @Test(dependsOnMethods = "testAutomationMaintenanceMode") + @Test + public void testRemoveMaintenanceReasonNoDuplicates() throws Exception { + // Make sure we start clean + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // First put the cluster in maintenance mode via USER + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "User entry", null); + + // Then add AUTOMATION reason + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "Automation entry", null); + + // Verify we have both reasons + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Remove AUTOMATION reason + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify we only have USER reason and no duplicate entries + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify the reasons list field to ensure no duplicates + List reasonsList = signal.getRecord().getListField("reasons"); + Assert.assertEquals(reasonsList.size(), 1, "Should have exactly 1 entry in reasons list"); + + // Verify the simple fields are correct + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "User entry"); + + // Remove USER reason + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify we're out of maintenance mode + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Test that the code can handle an old client that writes a MaintenanceSignal + * without using the multi-actor maintenance API (no reasons list). + */ + @Test + public void testLegacyClientCompatibility() throws Exception { + // Simulate an old client by creating a MaintenanceSignal with only simple fields + ZNRecord record = new ZNRecord("maintenance"); + // Add just the simple fields like an old client would + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "Legacy client reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + + // Write it directly to ZK + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify the maintenance signal is there + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "Legacy client reason"); + + // Verify that the list field does not exist yet + List reasonsListBefore = signal.getRecord().getListField("reasons"); + Assert.assertNull(reasonsListBefore, "Should not have reasons list field yet"); + + // Explicitly call reconcileMaintenanceData + boolean updated = signal.reconcileMaintenanceData(); + Assert.assertTrue(updated, "Should have updated the ZNode during reconciliation"); + + // Verify the reasons list was created + List reasonsListAfter = signal.getRecord().getListField("reasons"); + Assert.assertNotNull(reasonsListAfter, "Should have created reasons list field"); + Assert.assertEquals(reasonsListAfter.size(), 1, "Should have added exactly one entry"); + + // Verify the entry content contains all simple fields + String entryString = reasonsListAfter.get(0); + Assert.assertTrue(entryString.contains("Legacy client reason"), + "Entry should contain the reason text"); + Assert.assertTrue(entryString.contains(MaintenanceSignal.TriggeringEntity.USER.name()), + "Entry should contain the entity"); + + // Save to ZK + _dataAccessor.setProperty(_keyBuilder.maintenance(), signal); + + // Now read it back to verify persistence + MaintenanceSignal readBackSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + List persistedReasonsList = readBackSignal.getRecord().getListField("reasons"); + Assert.assertEquals(persistedReasonsList.size(), 1, "Should have persisted exactly one entry"); + + // Now try to add a new reason via the new API - should work with the legacy format + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "Automation entry with legacy client", null); + + // Verify both reasons exist + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Try removing the automation reason + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify only USER reason remains + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Clean up + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Test that the IN_MAINTENANCE_AFTER_OPERATION field in the history record + * is only set to false when all maintenance reasons are gone. + */ + @Test(dependsOnMethods = "testLegacyClientCompatibility") + public void testMaintenanceHistoryAfterOperationFlag() throws Exception { + // Make sure we start clean + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // First put the cluster in maintenance mode via USER + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "User entry2", null); + + // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as true + ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + Map lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "ENTER"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + + // Add a second actor (AUTOMATION) + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "Automation entry2", null); + + // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as true + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "ENTER"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "AUTOMATION"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + + // Remove AUTOMATION actor, but we should still be in maintenance mode because USER remains + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + // Verify we're still in maintenance mode with a single actor + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as true + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "AUTOMATION"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + + // Remove USER actor, which should get us out of maintenance mode + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + + // Verify we're out of maintenance mode + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as false + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "false"); + } + + /** + * Set up the initial state and mock components for maintenance mode tests. + * This ensures maintenance mode doesn't get automatically exited. + */ + private void setupMaintenanceModeTest() throws Exception { + // Set cluster config to ensure auto-exit conditions are never met + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxOfflineInstancesAllowed(2); + clusterConfig.setNumOfflineInstancesForAutoExit(1); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + + // Kill 3 instances + for (int i = 0; i < 3; i++) { + _participants[i].syncStop(); + } + TestHelper.verify(() -> _dataAccessor.getChildNames(_keyBuilder.liveInstances()).isEmpty(), 2000L); + + // Check that the cluster is in maintenance + MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(maintenanceSignal); + } + + /** + * Helper method to set up a common scenario for maintenance mode tests: + * 1. Controller enters maintenance mode + * 2. User A enters maintenance mode + * 3. User B enters maintenance mode (overrides User A) + * 4. Automation enters maintenance mode + * 5. Old client enters maintenance mode (simple fields only) + * + */ + private void setupMultiActorMaintenanceScenario() throws Exception { + // Set up the maintenance mode test environment + setupMaintenanceModeTest(); + + // Verify maintenance signal with CONTROLLER reason only + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + + // Step 2: USER (UserA) puts the cluster into MM (t2) + Thread.sleep(10); // Ensure different timestamps + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", null); + + // Verify maintenance signal has both reasons + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Step 3: USER (UserB) puts the cluster into MM (t3) - overrides UserA's entry + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_C", null); + + // Verify maintenance signal still has same number of reasons but UserB's reason replaced UserA's + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_C"); + + // Step 4: AUTOMATION (HelixACM) puts the cluster into MM (t4) + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_D", null); + + // Verify maintenance signal has all three reasons + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Step 5: USER (Old Client) enters cluster into MM (t5) + // Simulate old client by directly creating a MaintenanceSignal with simple fields only + Thread.sleep(10); + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "reason_E"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + + // Write directly to ZK + _dataAccessor.updateProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify maintenance signal has updated simple fields but listField still has old USER entry + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_E"); + + // Verify reasons list still has original 3 entries (not updated by old client) + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + } + + /** + * Test Case A: New clients interaction where each actor enters and exits properly + * 1. Each actor (User, Automation, Controller) enters maintenance mode + * 2. Each actor exits maintenance mode in sequence + * 3. Verify maintenance flags in history records + */ + @Test + public void testMultiActorMaintenanceModeExitSequence() throws Exception { + // Set up the initial state with all actors having entered maintenance mode + setupMultiActorMaintenanceScenario(); + + // Step 6A: USER (New client) exits MM + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify USER reason is gone, but CONTROLLER and AUTOMATION remain + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Verify in history that we're still in maintenance + ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + Map lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + + // Step 7A: AUTOMATION exits MM + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify AUTOMATION reason is gone, only CONTROLLER remains + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Verify in history that we're still in maintenance + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "AUTOMATION"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + + // Step 8A: CONTROLLER exits MM + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode( + CLUSTER_NAME, false, null, MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE); + + // Verify we're out of maintenance mode + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Verify in history that we're no longer in maintenance + history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "CONTROLLER"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "false"); + } + + /** + * Test Case B: Old client enters, new clients reconcile during future operations + * 1. Old client enters maintenance mode without updating the reasons list + * 2. New client enters maintenance mode and reconciles the list + * 3. All actors exit in sequence + */ + @Test + public void testMultiActorMaintenanceModeReconciliation() throws Exception { + // Set up the initial state with all actors having entered maintenance mode + setupMultiActorMaintenanceScenario(); + + // Step 6B: AUTOMATION enters MM again - should reconcile the old client's USER update + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_F", null); + + // Verify signal has reconciled the old client's USER update with the timestamp + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Verify reason is now "reason_E" + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_E"); + + // Exit all actors in sequence + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode( + CLUSTER_NAME, false, null, MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify we're out of maintenance mode + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Test Case C: Old client exits maintenance mode while other actors still have reasons + * 1. Old client bypasses the normal API and just deletes the maintenance node + * 2. Verify the cluster exits maintenance mode completely + */ + @Test + public void testMultiActorMaintenanceModeOldClientExit() throws Exception { + // Set up the initial state with all actors having entered maintenance mode + setupMultiActorMaintenanceScenario(); + + // Step 6C: USER (old client) exits MM - simulating old client removing entire node + _dataAccessor.removeProperty(_keyBuilder.maintenance()); + + // Verify we're out of maintenance mode completely + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } } diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java index 5a1a8a5bcb..4507c71060 100644 --- a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java @@ -364,6 +364,12 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S } + @Override + public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason, + Map customFields) { + + } + @Override public boolean isInMaintenanceMode(String clusterName) { return false; diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java index 1e752f3feb..dbe8a1a6f1 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java @@ -317,6 +317,8 @@ public Response updateCluster(@PathParam("clusterId") String clusterId, // Try to parse the content string. If parseable, use it as a KV mapping. Otherwise, treat it // as a REASON String Map customFieldsMap = null; + // Default to USER triggering entity + boolean isAutomationTriggered = false; try { // Try to parse content customFieldsMap = @@ -328,13 +330,24 @@ public Response updateCluster(@PathParam("clusterId") String clusterId, if ("reason".equalsIgnoreCase(entry.getKey())) { content = entry.getValue(); } + if ("isAutomation".equalsIgnoreCase(entry.getKey())) { + isAutomationTriggered = Boolean.parseBoolean(entry.getValue()); + } } } catch (Exception e) { // NOP } - helixAdmin - .manuallyEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, - content, customFieldsMap); + + // Choose the appropriate method based on the triggering entity + if (isAutomationTriggered) { + helixAdmin + .automationEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, + content, customFieldsMap); + } else { + helixAdmin + .manuallyEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, + content, customFieldsMap); + } break; case enableWagedRebalanceForAllResources: // Enable WAGED rebalance for all resources in the cluster From cdac22aa2d7c48136f9175c76f86c79d4d09fbca Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Sun, 11 May 2025 12:03:04 +0530 Subject: [PATCH 02/10] Reconciling properly during exit and entry --- .../apache/helix/manager/zk/ZKHelixAdmin.java | 4 --- .../apache/helix/model/MaintenanceSignal.java | 36 +++++++++++-------- .../resources/helix/ClusterAccessor.java | 8 +++++ 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index b2e53b12bc..65da302066 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1235,14 +1235,10 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, // If this was the last reason, remove the maintenance ZNode entirely accessor.removeProperty(keyBuilder.maintenance()); } - } else { - // If old client, just remove the maintenance node entirely - accessor.removeProperty(keyBuilder.maintenance()); } } } else { // Enter maintenance mode - if (maintenanceSignal == null) { // Create a new maintenance signal if it doesn't exist maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index 020ee5c430..20ed4b0193 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -177,7 +177,6 @@ public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity // Update all the simpleFields for backward compatibility setReason(reason); - _record.setSimpleField("reason", reason); // Also update lowercase "reason" for complete compatibility setTimestamp(timestamp); setTriggeringEntity(triggeringEntity); } @@ -270,10 +269,30 @@ private Map parseJsonStyleEntry(String entryStr) { public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { LOG.info("Removing maintenance reason for entity: {}", triggeringEntity); - // Get the current list of reasons + // First reconcile data to capture any simple field updates from old clients + // that might not have updated the reasons list + reconcileMaintenanceData(); + + // Get the current list of reasons (after reconciliation) List> reasons = getMaintenanceReasons(); - int originalSize = reasons.size(); + // Check if this entity exists in the reasons list + boolean entityExists = false; + for (Map entry : reasons) { + String entryEntity = entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()); + if (triggeringEntity.name().equals(entryEntity)) { + entityExists = true; + break; + } + } + + // If the entity doesn't exist in reasons list, ignore the removal request + if (!entityExists) { + LOG.info("Entity {} doesn't have a maintenance reason entry, ignoring exit request", triggeringEntity); + return false; + } + + int originalSize = reasons.size(); LOG.debug("Before removal: Reasons list contains {} entries", reasons.size()); // Create a new list to avoid modifying the original during iteration @@ -318,16 +337,8 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { newReason, newEntity, newTimestamp); setReason(newReason); - _record.setSimpleField("reason", newReason); // Also update lowercase "reason" for complete compatibility setTimestamp(newTimestamp); setTriggeringEntity(newEntity); - } else { - // If no reasons left, clear all fields - LOG.info("No maintenance reasons remaining, clearing all fields"); - setReason(null); - _record.setSimpleField("reason", null); - setTimestamp(System.currentTimeMillis()); - setTriggeringEntity(TriggeringEntity.UNKNOWN); } } else { LOG.info("No matching maintenance reason found for entity: {}", triggeringEntity); @@ -379,9 +390,6 @@ public boolean reconcileMaintenanceData() { parsedReasons.add(entry); } } - } else { - // Create a new list - rawReasonsList = new ArrayList<>(); } // Check if the triggering entity from simple fields already has an entry diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java index dbe8a1a6f1..f903a26c01 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java @@ -338,6 +338,14 @@ public Response updateCluster(@PathParam("clusterId") String clusterId, // NOP } + // Filter out isAutomation and reason keys from customFieldsMap as they are already handled as + // TriggeringEntity and REASON + if (customFieldsMap != null) { + customFieldsMap.entrySet().removeIf(entry -> + "isAutomation".equalsIgnoreCase(entry.getKey()) || + "reason".equalsIgnoreCase(entry.getKey())); + } + // Choose the appropriate method based on the triggering entity if (isAutomationTriggered) { helixAdmin From 42ba048a19f1c24631156cada73719a89db9f9df Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Sun, 11 May 2025 12:25:10 +0530 Subject: [PATCH 03/10] MM cleanup fix --- .../main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java | 3 ++- .../integration/controller/TestClusterMaintenanceMode.java | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index 65da302066..b54e1d6ff8 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1227,7 +1227,8 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, if (removed) { // If there are still reasons for maintenance mode, update the ZNode - if (maintenanceSignal.hasMaintenanceReasons()) { + if (maintenanceSignal.getRecord().getListField("reasons") != null + && !maintenanceSignal.getRecord().getListField("reasons").isEmpty()) { if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { throw new HelixException("Failed to update maintenance signal!"); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index d7c0b56e2d..91ee8f8421 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -20,7 +20,6 @@ */ import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; From b4f97ea32154b59091473230d3aa7652e2d2826a Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Sun, 11 May 2025 12:41:54 +0530 Subject: [PATCH 04/10] Add more cases for reconcilation cases --- .../TestClusterMaintenanceMode.java | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index 91ee8f8421..f3063ecb10 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -899,4 +899,101 @@ public void testMultiActorMaintenanceModeOldClientExit() throws Exception { // Verify we're out of maintenance mode completely TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } + + /** + * Old client overrides simple fields after new clients enter MM, + * then new client exits but should not exit maintenance mode completely + * 1. AUTOMATION enters MM (new client) + * 2. USER enters MM again (old client - only updates simple fields) + * 3. AUTOMATION exits MM + * 4. Verify we're still in maintenance mode because USER reason remains + */ + @Test + public void testMultiActorMaintenanceModeOldClientOverride() throws Exception { + // Step 1: AUTOMATION enters MM (t2) + Thread.sleep(10); // Ensure different timestamps + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "AUTOMATION reason", null); + + // Verify maintenance signal has only AUTOMATION reason. + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Step 2: USER enters MM (t3) via old client (only updates simple fields) + Thread.sleep(10); + long t3 = System.currentTimeMillis(); + // Simulate an old client by creating a MaintenanceSignal with only simple fields + ZNRecord record = new ZNRecord(signal.getRecord()); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "USER old client reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(t3)); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + // Don't update the listField - simulate old client behavior + + // Write it directly to ZK + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify the simple fields were updated but the reasons list still has old USER entry + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "USER old client reason"); + + // MaintenanceReasonsCount should still be 2 because the old client didn't update the reasons list + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + + // Step 3: AUTOMATION exits MM + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Verify we're still in maintenance mode because USER reason remains + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Check that simple fields were updated to USER values + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "USER old client reason"); + + // Clean up + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Entity tries to exit MM without having entered it + * 1. AUTOMATION enters MM + * 2. USER tries to exit MM (shouldn't affect MM state) + * 3. Verify we're still in maintenance mode because USER wasn't an actor + * 4. AUTOMATION exits MM + * 5. Verify we're out of maintenance mode + */ + @Test + public void testMultiActorMaintenanceModeInvalidExit() throws Exception { + // Step 1: AUTOMATION enters MM + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "AUTOMATION reason", null); + + // Verify maintenance signal with AUTOMATION reason + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Step 2: USER tries to exit MM (shouldn't affect MM state) + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Step 3: Verify we're still in maintenance mode + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Step 4: AUTOMATION exits MM + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + // Step 5: Verify we're out of maintenance mode + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } } From aa0c54d2761362714b10e2fd3e9556f671f17117 Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Sun, 11 May 2025 12:43:29 +0530 Subject: [PATCH 05/10] remove unncessary depends --- .../apache/helix/manager/zk/ZKHelixAdmin.java | 28 +++++++++++-------- .../apache/helix/model/MaintenanceSignal.java | 20 +------------ .../TestClusterMaintenanceMode.java | 2 -- .../resources/helix/ClusterAccessor.java | 3 -- 4 files changed, 18 insertions(+), 35 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index b54e1d6ff8..e486f3a99d 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1257,23 +1257,29 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, maintenanceSignal.setTimestamp(currentTime); maintenanceSignal.setTriggeringEntity(triggeringEntity); - // For CONTROLLER type, set the auto trigger reason - if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) { - maintenanceSignal.setAutoTriggerReason(internalReason); - } else if (customFields != null && !customFields.isEmpty()) { - // For other types, add custom fields if provided - Map simpleFields = maintenanceSignal.getRecord().getSimpleFields(); - for (Map.Entry entry : customFields.entrySet()) { - if (!simpleFields.containsKey(entry.getKey())) { - simpleFields.put(entry.getKey(), entry.getValue()); + switch (triggeringEntity) { + case CONTROLLER: + // autoEnable + maintenanceSignal.setAutoTriggerReason(internalReason); + break; + case USER: + case UNKNOWN: + // manuallyEnable + if (customFields != null && !customFields.isEmpty()) { + // Enter all custom fields provided by the user + Map simpleFields = maintenanceSignal.getRecord().getSimpleFields(); + for (Map.Entry entry : customFields.entrySet()) { + if (!simpleFields.containsKey(entry.getKey())) { + simpleFields.put(entry.getKey(), entry.getValue()); + } + } } - } + break; } // Add this reason to the multi-actor maintenance reasons list maintenanceSignal.addMaintenanceReason(reason, currentTime, triggeringEntity); - // Create or update the maintenance node if (accessor.getProperty(keyBuilder.maintenance()) == null) { if (!accessor.createMaintenance(maintenanceSignal)) { throw new HelixException("Failed to create maintenance signal!"); diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index 20ed4b0193..e18575e25c 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -141,18 +141,15 @@ public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity LOG.info("Adding maintenance reason for entity: {}, reason: {}, timestamp: {}", triggeringEntity, reason, timestamp); - // The triggering entity is our unique key - we will overwrite any existing entry with this entity + // The triggering entity is our unique key - Overwrite any existing entry with this entity String triggerEntityStr = triggeringEntity.name(); - // Get the current list of reasons List> reasons = getMaintenanceReasons(); LOG.debug("Before addition: Reasons list contains {} entries", reasons.size()); - // Check if we already have a reason for this entity boolean found = false; for (Map entry : reasons) { if (triggerEntityStr.equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { - // Update the existing entry entry.put(PauseSignalProperty.REASON.name(), reason); entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp)); found = true; @@ -161,7 +158,6 @@ public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity } } - // If we didn't find an entry, add a new one if (!found) { Map newEntry = new HashMap<>(); newEntry.put(PauseSignalProperty.REASON.name(), reason); @@ -171,7 +167,6 @@ public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity LOG.debug("Added new entry for entity: {}", triggeringEntity); } - // Update the ZNRecord with our updated list updateReasonsListField(reasons); LOG.debug("After addition: Reasons list contains {} entries", reasons.size()); @@ -208,7 +203,6 @@ private String convertMapToJsonString(Map map) { return new ObjectMapper().writeValueAsString(map); } catch (IOException e) { LOG.warn("Failed to convert map to JSON string: {}", e.getMessage()); - // Fallback to a simple representation if JSON serialization fails return ""; } } @@ -234,7 +228,6 @@ public List> getMaintenanceReasons() { reasons.add(entry); } } else { - // Parse each entry as a JSON-formatted string for (String entryStr : reasonsList) { Map entry = parseJsonStyleEntry(entryStr); if (!entry.isEmpty()) { @@ -273,10 +266,8 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { // that might not have updated the reasons list reconcileMaintenanceData(); - // Get the current list of reasons (after reconciliation) List> reasons = getMaintenanceReasons(); - // Check if this entity exists in the reasons list boolean entityExists = false; for (Map entry : reasons) { String entryEntity = entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()); @@ -286,7 +277,6 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { } } - // If the entity doesn't exist in reasons list, ignore the removal request if (!entityExists) { LOG.info("Entity {} doesn't have a maintenance reason entry, ignoring exit request", triggeringEntity); return false; @@ -295,7 +285,6 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { int originalSize = reasons.size(); LOG.debug("Before removal: Reasons list contains {} entries", reasons.size()); - // Create a new list to avoid modifying the original during iteration List> updatedReasons = new ArrayList<>(); String targetEntity = triggeringEntity.name(); @@ -314,7 +303,6 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { LOG.debug("After removal: Reasons list contains {} entries", updatedReasons.size()); if (removed) { - // Update the ZNRecord with the new reasons updateReasonsListField(updatedReasons); // Update the simpleFields with the most recent reason (for backward compatibility) @@ -326,7 +314,6 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { return Long.compare(t2, t1); }); - // Update simple fields with the most recent Map mostRecent = updatedReasons.get(0); String newReason = mostRecent.get(PauseSignalProperty.REASON.name()); long newTimestamp = Long.parseLong(mostRecent.get(MaintenanceSignalProperty.TIMESTAMP.name())); @@ -374,8 +361,6 @@ public boolean reconcileMaintenanceData() { long simpleTimestamp = getTimestamp(); TriggeringEntity simpleTriggeringEntity = getTriggeringEntity(); - // Look at the raw reasons list - don't use getMaintenanceReasons() as it already - // synthesizes entries from simple fields which would create a circular dependency List rawReasonsList = _record.getListField(REASONS_LIST_FIELD); List> parsedReasons = new ArrayList<>(); @@ -383,7 +368,6 @@ public boolean reconcileMaintenanceData() { boolean needsUpdate = (rawReasonsList == null); if (rawReasonsList != null) { - // Parse each entry to check if our simple fields already have a corresponding entry for (String entryStr : rawReasonsList) { Map entry = parseJsonStyleEntry(entryStr); if (!entry.isEmpty()) { @@ -396,7 +380,6 @@ public boolean reconcileMaintenanceData() { boolean alreadyPresent = false; for (Map entry : parsedReasons) { if (simpleTriggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { - // Check if timestamps are different long entryTimestamp = Long.parseLong(entry.get(MaintenanceSignalProperty.TIMESTAMP.name())); if (simpleTimestamp > entryTimestamp) { // If simple field timestamp is newer, update the entry @@ -424,7 +407,6 @@ public boolean reconcileMaintenanceData() { } if (needsUpdate) { - // Update the ZNRecord with the new reasons updateReasonsListField(parsedReasons); LOG.debug("Updated reasons list after reconciliation, now contains {} entries", parsedReasons.size()); diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index f3063ecb10..8b232f39c2 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -447,7 +447,6 @@ private static Map convertStringToMap(String value) throws IOExc * Test that automation triggered maintenance mode works correctly * and multi-actor maintenance mode requires all actors to exit */ -// @Test(dependsOnMethods = "testMaintenanceHistory") @Test public void testAutomationMaintenanceMode() throws Exception { // Make sure we're not in maintenance mode at the start @@ -501,7 +500,6 @@ public void testAutomationMaintenanceMode() throws Exception { /** * Test that removing a maintenance reason doesn't add duplicate entries in the reasons list */ -// @Test(dependsOnMethods = "testAutomationMaintenanceMode") @Test public void testRemoveMaintenanceReasonNoDuplicates() throws Exception { // Make sure we start clean diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java index f903a26c01..5d1b7a8545 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java @@ -338,15 +338,12 @@ public Response updateCluster(@PathParam("clusterId") String clusterId, // NOP } - // Filter out isAutomation and reason keys from customFieldsMap as they are already handled as - // TriggeringEntity and REASON if (customFieldsMap != null) { customFieldsMap.entrySet().removeIf(entry -> "isAutomation".equalsIgnoreCase(entry.getKey()) || "reason".equalsIgnoreCase(entry.getKey())); } - // Choose the appropriate method based on the triggering entity if (isAutomationTriggered) { helixAdmin .automationEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode, From 68a64769b7bc96fc17caa84aa8b3c6d3b23894c7 Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Mon, 19 May 2025 13:00:39 +0530 Subject: [PATCH 06/10] Fix the switch case --- .../src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java | 1 + .../helix/integration/controller/TestClusterMaintenanceMode.java | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index e486f3a99d..3a1fa9f9a4 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1263,6 +1263,7 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, maintenanceSignal.setAutoTriggerReason(internalReason); break; case USER: + case AUTOMATION: case UNKNOWN: // manuallyEnable if (customFields != null && !customFields.isEmpty()) { diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index 8b232f39c2..e3064a5e77 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -909,7 +909,6 @@ public void testMultiActorMaintenanceModeOldClientExit() throws Exception { @Test public void testMultiActorMaintenanceModeOldClientOverride() throws Exception { // Step 1: AUTOMATION enters MM (t2) - Thread.sleep(10); // Ensure different timestamps _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "AUTOMATION reason", null); From b6a081b36d560a36a52f064e53c28b90a25cd49d Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Mon, 19 May 2025 13:08:06 +0530 Subject: [PATCH 07/10] Remove unneccessary setting of simile fieldds --- .../main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java | 1 - .../main/java/org/apache/helix/model/MaintenanceSignal.java | 5 ----- 2 files changed, 6 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index 3a1fa9f9a4..6a7b1866eb 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -955,7 +955,6 @@ public boolean isInMaintenanceMode(String clusterName) { return false; } - // Check if there are any maintenance reasons active return signal.hasMaintenanceReasons(); } diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index e18575e25c..1ef3fe2aa2 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -169,11 +169,6 @@ public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity updateReasonsListField(reasons); LOG.debug("After addition: Reasons list contains {} entries", reasons.size()); - - // Update all the simpleFields for backward compatibility - setReason(reason); - setTimestamp(timestamp); - setTriggeringEntity(triggeringEntity); } /** From 36830c7c59a5630dc7e3eea2472c79d41b6346a4 Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Sun, 13 Jul 2025 13:24:57 +0530 Subject: [PATCH 08/10] Revamp the code as per the final design --- .../apache/helix/manager/zk/ZKHelixAdmin.java | 31 +- .../apache/helix/model/MaintenanceSignal.java | 125 +-- .../TestClusterMaintenanceMode.java | 950 +++++++++++------- 3 files changed, 644 insertions(+), 462 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index 6a7b1866eb..98d1ea6e9c 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -955,7 +955,11 @@ public boolean isInMaintenanceMode(String clusterName) { return false; } - return signal.hasMaintenanceReasons(); + // The cluster is in maintenance mode if the maintenance signal ZNode exists + // This includes cases where old clients have wiped listField data but simpleFields remain + // cluster should remain in maintenance mode as long as ZNode exists + return signal.hasMaintenanceReasons() || + (signal.getReason() != null && !signal.getReason().isEmpty()); } @Override @@ -1235,6 +1239,25 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, // If this was the last reason, remove the maintenance ZNode entirely accessor.removeProperty(keyBuilder.maintenance()); } + } else { + // Case where triggering entity doesn't have an entry + // Note: CONTROLLER/AUTOMATION is strict no-op, USER can do administrative override + if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { + // USER has special privilege to force exit maintenance mode as administrative override + logger.info("USER administrative override: forcefully exiting maintenance mode for cluster {}", clusterName); + accessor.removeProperty(keyBuilder.maintenance()); + } else { + // CONTROLLER/AUTOMATION: strict no-op if their entry not found + logger.info("Entity {} doesn't have a maintenance reason entry, exit request ignored", triggeringEntity); + } + } + } else { + // No maintenance signal exists + if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { + logger.info("USER administrative override: no maintenance signal exists, nothing to remove"); + } else { + // CONTROLLER/AUTOMATION: strict no-op + logger.info("Entity {} attempted to exit maintenance mode but no maintenance signal exists", triggeringEntity); } } } else { @@ -1244,9 +1267,9 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); } - // First check for potential old client updates (simpleFields different than listField entries) - // This MUST happen before we modify any simpleFields to avoid overwriting important data needed for reconciliation - maintenanceSignal.reconcileMaintenanceData(); + // This is CRITICAL: Reconcile any legacy data BEFORE updating simpleFields + // This must happen before any simpleField updates to preserve legacy USER data + maintenanceSignal.reconcileLegacyData(); // Add the reason to the maintenance signal if (reason != null) { diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index 1ef3fe2aa2..b08cc2630e 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -211,18 +211,7 @@ public List> getMaintenanceReasons() { List> reasons = new ArrayList<>(); List reasonsList = _record.getListField(REASONS_LIST_FIELD); - if (reasonsList == null || reasonsList.isEmpty()) { - // If the list doesn't exist but simple fields do, add the simple fields as the first reason - String simpleReason = getReason(); - if (simpleReason != null) { - Map entry = new HashMap<>(); - entry.put(PauseSignalProperty.REASON.name(), simpleReason); - entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(getTimestamp())); - entry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), getTriggeringEntity().name()); - - reasons.add(entry); - } - } else { + if (reasonsList != null && !reasonsList.isEmpty()) { for (String entryStr : reasonsList) { Map entry = parseJsonStyleEntry(entryStr); if (!entry.isEmpty()) { @@ -257,10 +246,6 @@ private Map parseJsonStyleEntry(String entryStr) { public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { LOG.info("Removing maintenance reason for entity: {}", triggeringEntity); - // First reconcile data to capture any simple field updates from old clients - // that might not have updated the reasons list - reconcileMaintenanceData(); - List> reasons = getMaintenanceReasons(); boolean entityExists = false; @@ -338,79 +323,6 @@ public boolean hasMaintenanceReasons() { return !getMaintenanceReasons().isEmpty(); } - /** - * Update the maintenance reason list to ensure all data is consistent. - * This is used when handling potential inconsistencies between simpleFields and - * the reasons list, which can happen with old clients. - * - * @return true if list was updated, false if no change was needed - */ - public boolean reconcileMaintenanceData() { - // Get the reason from simple fields - String simpleReason = getReason(); - - // If simpleFields are null or empty, nothing to reconcile - if (simpleReason == null) { - return false; - } - long simpleTimestamp = getTimestamp(); - TriggeringEntity simpleTriggeringEntity = getTriggeringEntity(); - - List rawReasonsList = _record.getListField(REASONS_LIST_FIELD); - List> parsedReasons = new ArrayList<>(); - - // If there's no list field at all, we'll need to create one - boolean needsUpdate = (rawReasonsList == null); - - if (rawReasonsList != null) { - for (String entryStr : rawReasonsList) { - Map entry = parseJsonStyleEntry(entryStr); - if (!entry.isEmpty()) { - parsedReasons.add(entry); - } - } - } - - // Check if the triggering entity from simple fields already has an entry - boolean alreadyPresent = false; - for (Map entry : parsedReasons) { - if (simpleTriggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { - long entryTimestamp = Long.parseLong(entry.get(MaintenanceSignalProperty.TIMESTAMP.name())); - if (simpleTimestamp > entryTimestamp) { - // If simple field timestamp is newer, update the entry - entry.put(PauseSignalProperty.REASON.name(), simpleReason); - entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(simpleTimestamp)); - needsUpdate = true; - } - alreadyPresent = true; - break; - } - } - - // If simple fields exist but not in the reasons list, add them - if (!alreadyPresent) { - Map newEntry = new HashMap<>(); - newEntry.put(PauseSignalProperty.REASON.name(), simpleReason); - newEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(simpleTimestamp)); - newEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), simpleTriggeringEntity.name()); - - parsedReasons.add(newEntry); - needsUpdate = true; - - LOG.debug("Added missing reason for {} to reasons list during reconciliation", - simpleTriggeringEntity); - } - - if (needsUpdate) { - updateReasonsListField(parsedReasons); - LOG.debug("Updated reasons list after reconciliation, now contains {} entries", - parsedReasons.size()); - return true; - } - - return false; - } - /** * Checks if there is a maintenance reason from a specific triggering entity. * @@ -462,4 +374,39 @@ public String getMaintenanceReason(TriggeringEntity triggeringEntity) { Map details = getMaintenanceReasonDetails(triggeringEntity); return details != null ? details.get(PauseSignalProperty.REASON.name()) : null; } + + /** + * Reconcile legacy data from simpleFields into listFields.reasons if it's missing. + * This preserves maintenance data written by old USER clients that only set simpleFields. + * + * NOTE: Only reconciles USER data, as: + * - CONTROLLER is part of core Helix system and should use proper APIs + * - AUTOMATION is new and has no legacy clients + * - Only USER entities represent external legacy clients that may wipe data + */ + public void reconcileLegacyData() { + // Check if simpleFields exist but corresponding listFields entry is missing + String simpleReason = getReason(); + TriggeringEntity simpleEntity = getTriggeringEntity(); + long simpleTimestamp = getTimestamp(); + + // Only reconcile USER data from legacy clients + // CONTROLLER and AUTOMATION should not have legacy data loss scenarios + if (simpleReason != null && !simpleReason.isEmpty() && simpleEntity == TriggeringEntity.USER + && simpleTimestamp > 0 && !hasMaintenanceReason(simpleEntity)) { + + // Legacy USER data exists but not in listFields - preserve it + Map legacyEntry = new HashMap<>(); + legacyEntry.put(PauseSignalProperty.REASON.name(), simpleReason); + legacyEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(simpleTimestamp)); + legacyEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), simpleEntity.name()); + + List> reasons = getMaintenanceReasons(); + reasons.add(legacyEntry); + updateReasonsListField(reasons); + + LOG.info("Reconciled legacy USER maintenance data: reason={}, timestamp={}", + simpleReason, simpleTimestamp); + } + } } diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index e3064a5e77..4288179799 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.util.HashMap; -import java.util.List; import java.util.Map; import javax.management.MalformedObjectNameException; import javax.management.ObjectName; @@ -444,553 +443,766 @@ private static Map convertStringToMap(String value) throws IOExc } /** - * Test that automation triggered maintenance mode works correctly - * and multi-actor maintenance mode requires all actors to exit + * Test basic multi-actor stacking behavior. + * Verifies core functionality: actor-based stacking, actor override, simpleFields most recent */ @Test public void testAutomationMaintenanceMode() throws Exception { - // Make sure we're not in maintenance mode at the start + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); - // Put cluster in maintenance mode via automation - Map customFields = ImmutableMap.of("TICKET", "AUTO-123", "SOURCE", "HelixACM"); - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, - "Automation maintenance", customFields); + // Step 1: USER enters MM (t1) with reason_A + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_A", null); - // Verify we are in maintenance mode with the right attributes - MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(maintenanceSignal); - Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), - MaintenanceSignal.TriggeringEntity.AUTOMATION); + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_A"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_A"); - // Verify custom fields were set - for (Map.Entry entry : customFields.entrySet()) { - Assert.assertEquals(maintenanceSignal.getRecord().getSimpleField(entry.getKey()), - entry.getValue()); - } + Thread.sleep(10); // Ensure different timestamps - // Manually put the cluster in maintenance too - this should keep both reasons + // Step 2: AUTOMATION enters MM (t2) with reason_B - should stack with USER + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Most recent + Assert.assertEquals(signal.getReason(), "reason_B"); // Most recent + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_A"); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B"); + + Thread.sleep(10); + + // Step 3: USER enters MM again (t3) with reason_C - should override previous USER entry _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, - "User maintenance", null); + "reason_C", null); - // Verify we have both reasons - maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertTrue(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - Assert.assertTrue(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); - Assert.assertEquals(maintenanceSignal.getMaintenanceReasonsCount(), 2); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); // Still only 2 (USER overrode itself) + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); // Most recent + Assert.assertEquals(signal.getReason(), "reason_C"); // Most recent + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "reason_C"); // Updated + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "reason_B"); // Unchanged - // Exit maintenance mode for USER only - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + Thread.sleep(10); - // Verify we're still in maintenance mode with only AUTOMATION reason - maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(maintenanceSignal); - Assert.assertFalse(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); - Assert.assertTrue(maintenanceSignal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - Assert.assertEquals(maintenanceSignal.getMaintenanceReasonsCount(), 1); + // Step 4: AUTOMATION enters MM again (t4) with reason_D - should override previous AUTOMATION entry + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_D", null); - // Exit maintenance mode for AUTOMATION - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); // Still only 2 (AUTOMATION overrode itself) + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Most recent + Assert.assertEquals(signal.getReason(), "reason_D"); // Most recent + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "reason_C"); // Unchanged + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "reason_D"); // Updated + + // Clean exit sequence: actors exit in order + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - // Verify we're now out of maintenance mode + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(signal.getReason(), "reason_D"); // Updated to remaining reason + + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Test that removing a maintenance reason doesn't add duplicate entries in the reasons list + * USER administrative override after old client data loss + * 1. Multi-actor setup with CONTROLLER, USER, AUTOMATION + * 2. Old client wipes listField data (keeps only simpleFields) + * 3. AUTOMATION tries to exit MM (no-op since its data was wiped) + * 4. USER exits MM (administrative override - forces complete exit) */ @Test - public void testRemoveMaintenanceReasonNoDuplicates() throws Exception { - // Make sure we start clean - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + public void testLegacyClientCompatibility() throws Exception { + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); - // First put the cluster in maintenance mode via USER + // Wait for config to be applied + TestHelper.verify(() -> { + ClusterConfig currentConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + return currentConfig.getNumOfflineInstancesForAutoExit() == -1; + }, 2000L); _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, - "User entry", null); - - // Then add AUTOMATION reason + "reason_A", null); + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + Thread.sleep(10); _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, - "Automation entry", null); + "reason_C", null); - // Verify we have both reasons + // Verify multi-actor setup MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Remove AUTOMATION reason - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Simulate old client wiping listField data (only keeps simpleFields) + Thread.sleep(10); + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "reason_D"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + // Old client doesn't set listField - simulates wiping all listField data + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); - // Verify we only have USER reason and no duplicate entries + // Verify old client wiped listField data but simpleFields remain signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); - Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + "Old client should have wiped listField data"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); + Assert.assertFalse(signal.hasMaintenanceReasons(), + "Should have no listField reasons after old client wipe"); - // Verify the reasons list field to ensure no duplicates - List reasonsList = signal.getRecord().getListField("reasons"); - Assert.assertEquals(reasonsList.size(), 1, "Should have exactly 1 entry in reasons list"); + // AUTOMATION tries to exit MM -> should be no-op because its entry was wiped + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - // Verify the simple fields are correct + // Verify maintenance signal remains the same (no-op) + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal, "Should still be in maintenance after AUTOMATION no-op"); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0); + Assert.assertEquals(signal.getReason(), "reason_D"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - Assert.assertEquals(signal.getReason(), "User entry"); - // Remove USER reason - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // USER tries to exit MM -> should trigger administrative override and delete maintenance ZNode + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); - // Verify we're out of maintenance mode + // Verify we're completely out of maintenance mode due to USER administrative override TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Test that the code can handle an old client that writes a MaintenanceSignal - * without using the multi-actor maintenance API (no reasons list). + * Helper method to set up a common scenario for maintenance mode tests: + * 1. USER enters maintenance mode + * 2. AUTOMATION enters maintenance mode + * 3. User B enters maintenance mode (overrides User A) + * 4. Old client enters maintenance mode (simple fields only - wipes listField data) */ - @Test - public void testLegacyClientCompatibility() throws Exception { - // Simulate an old client by creating a MaintenanceSignal with only simple fields + private void setupMultiActorMaintenanceScenario() throws Exception { + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 0: CONTROLLER puts the cluster into MM (t1) + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_Controller", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + + // Verify maintenance signal with USER reason only + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + + // Step 1: USER (UserA) puts the cluster into MM (t1) + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_A", null); + + // Verify maintenance signal with USER reason only + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + + // Step 2: AUTOMATION puts the cluster into MM (t2) + Thread.sleep(10); // Ensure different timestamps + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_B", null); + + // Verify maintenance signal has both reasons + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + + // Step 3: USER (UserB) puts the cluster into MM (t3) - overrides UserA's entry + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "reason_C", null); + + // Verify maintenance signal still has same number of reasons but UserB's reason replaced UserA's + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_C"); + + // Step 4: USER (Old Client) enters cluster into MM (t4) + // Simulate old client by directly creating a MaintenanceSignal with simple fields only + // Per design doc: "Legacy clients use the dataAccessor.set() API to create Maintenance signals, + // which results in the entire ZNRecord being overwritten, including purging all existing ListField entries" + Thread.sleep(10); ZNRecord record = new ZNRecord("maintenance"); - // Add just the simple fields like an old client would - record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "Legacy client reason"); - record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), - MaintenanceSignal.TriggeringEntity.USER.name()); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "reason_D"); record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(System.currentTimeMillis())); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); - // Write it directly to ZK + // Use setProperty (not updateProperty) to simulate old client completely overwriting the ZNode _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); - // Verify the maintenance signal is there + // Verify maintenance signal has updated simple fields but listField data was wiped by old client + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); + + // Verify reasons list was wiped by old client (data loss accepted by design) + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + "Old client should have wiped all listField data"); + } + + /** + * Test Case A: USER administrative override after old client data loss + * 1. After old client wipes data, verify no listField reasons exist + * 2. USER tries to exit MM - should trigger administrative override and delete maintenance ZNode + * 3. Verify maintenance mode is completely exited + */ + @Test + public void testUserAdministrativeOverride() throws Exception { + // Set up the initial state with all actors having entered maintenance mode + // Note: setupMultiActorMaintenanceScenario() ends with old client wiping listField data + setupMultiActorMaintenanceScenario(); + + // Verify the old client has wiped the listField data MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + "Old client should have wiped listField data"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - Assert.assertEquals(signal.getReason(), "Legacy client reason"); + Assert.assertEquals(signal.getReason(), "reason_D"); - // Verify that the list field does not exist yet - List reasonsListBefore = signal.getRecord().getListField("reasons"); - Assert.assertNull(reasonsListBefore, "Should not have reasons list field yet"); + // Step 6A: USER (New client) tries to exit MM + // Since USER doesn't have an entry in listFields.reasons, this should trigger administrative override + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - // Explicitly call reconcileMaintenanceData - boolean updated = signal.reconcileMaintenanceData(); - Assert.assertTrue(updated, "Should have updated the ZNode during reconciliation"); + // Verify we're completely out of maintenance mode due to USER administrative override + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); - // Verify the reasons list was created - List reasonsListAfter = signal.getRecord().getListField("reasons"); - Assert.assertNotNull(reasonsListAfter, "Should have created reasons list field"); - Assert.assertEquals(reasonsListAfter.size(), 1, "Should have added exactly one entry"); + // Verify in history that we're no longer in maintenance + ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + Map lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "false"); + } - // Verify the entry content contains all simple fields - String entryString = reasonsListAfter.get(0); - Assert.assertTrue(entryString.contains("Legacy client reason"), - "Entry should contain the reason text"); - Assert.assertTrue(entryString.contains(MaintenanceSignal.TriggeringEntity.USER.name()), - "Entry should contain the entity"); + /** + * Old client enters, new clients operate independently (no reconciliation) + * 1. Old client enters maintenance mode without updating the reasons list (data wiped) + * 2. New client enters maintenance mode and operates independently + * 3. All actors exit in sequence + */ + @Test + public void testOldClientDataLoss() throws Exception { + // Set up the initial state with all actors having entered maintenance mode + setupMultiActorMaintenanceScenario(); + + // At this point, the old client in setupMultiActorMaintenanceScenario has wiped the listField data + // Verify that the old client action resulted in data loss + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - // Save to ZK - _dataAccessor.setProperty(_keyBuilder.maintenance(), signal); + // simpleFields should show USER data (from old client) + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "reason_D"); - // Now read it back to verify persistence - MaintenanceSignal readBackSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - List persistedReasonsList = readBackSignal.getRecord().getListField("reasons"); - Assert.assertEquals(persistedReasonsList.size(), 1, "Should have persisted exactly one entry"); + // But listFields.reasons should be empty (wiped by old client) + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + "Old client should have wiped listField data"); + Assert.assertFalse(signal.hasMaintenanceReasons(), + "Should not have maintenance reasons after old client wipe"); - // Now try to add a new reason via the new API - should work with the legacy format + // Step 6B: AUTOMATION enters MM again - should work independently (no reconciliation) _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, - "Automation entry with legacy client", null); + "reason_F", null); - // Verify both reasons exist + // Verify signal now has only AUTOMATION reason (no reconciliation of old USER data) signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER), + "Controller data was lost"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "User data was lost"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Try removing the automation reason - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Verify the new AUTOMATION reason + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "reason_F"); - // Verify only USER reason remains - signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + // Exit the only remaining actor + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); - // Clean up - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Verify we're out of maintenance mode TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Test that the IN_MAINTENANCE_AFTER_OPERATION field in the history record - * is only set to false when all maintenance reasons are gone. + * Test AUTOMATION re-entry after old client wipes data + * Old client wipes data, AUTOMATION re-enters independently, + * then sequence of exits with no-ops and administrative override */ - @Test(dependsOnMethods = "testLegacyClientCompatibility") - public void testMaintenanceHistoryAfterOperationFlag() throws Exception { - // Make sure we start clean - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); - - // First put the cluster in maintenance mode via USER - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, - "User entry2", null); - - // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as true - ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - Map lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "ENTER"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + @Test + public void testAutomationReentryAfterDataLoss() throws Exception { + // Setup the multi-actor scenario which ends with old client wiping data + setupMultiActorMaintenanceScenario(); - // Add a second actor (AUTOMATION) + // Case B Step 6: AUTOMATION enters MM again (works independently, no reconciliation) _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, - "Automation entry2", null); - - // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as true - history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "ENTER"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "AUTOMATION"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + "reason_E", null); - // Remove AUTOMATION actor, but we should still be in maintenance mode because USER remains - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, - null, null); - - // Verify we're still in maintenance mode with a single actor + // Verify signal now has only AUTOMATION reason (no reconciliation of wiped data) MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "reason_E"); - // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as true - history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "AUTOMATION"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); - - // Remove USER actor, which should get us out of maintenance mode - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, - null, null); + // Case B Step 7: USER exits MM -> should be administrative override since USER has no listField entry + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); - // Verify we're out of maintenance mode + // Verify we're completely out of maintenance mode due to USER administrative override TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); - - // Verify history shows IN_MAINTENANCE_AFTER_OPERATION as false - history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "false"); } /** - * Set up the initial state and mock components for maintenance mode tests. - * This ensures maintenance mode doesn't get automatically exited. + * Test old client force exit by deleting entire maintenance znode */ - private void setupMaintenanceModeTest() throws Exception { - // Set cluster config to ensure auto-exit conditions are never met - ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); - clusterConfig.setMaxOfflineInstancesAllowed(2); - clusterConfig.setNumOfflineInstancesForAutoExit(1); - _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + @Test + public void testOldClientDeletesEntireZnode() throws Exception { + // Setup: Multiple actors enter maintenance mode + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); - // Kill 3 instances - for (int i = 0; i < 3; i++) { - _participants[i].syncStop(); - } - TestHelper.verify(() -> _dataAccessor.getChildNames(_keyBuilder.liveInstances()).isEmpty(), 2000L); + // Verify we have both actors + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); - // Check that the cluster is in maintenance - MaintenanceSignal maintenanceSignal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(maintenanceSignal); + // Case D: USER (old client) exits MM by deleting entire znode + _dataAccessor.removeProperty(_keyBuilder.maintenance()); + + // Verify we're completely out of maintenance mode (all data lost) + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Helper method to set up a common scenario for maintenance mode tests: - * 1. Controller enters maintenance mode - * 2. User A enters maintenance mode - * 3. User B enters maintenance mode (overrides User A) - * 4. Automation enters maintenance mode - * 5. Old client enters maintenance mode (simple fields only) - * + * Test that simpleFields always reflect the most recently added reason */ - private void setupMultiActorMaintenanceScenario() throws Exception { - // Set up the maintenance mode test environment - setupMaintenanceModeTest(); + @Test + public void testSimpleFieldsReflectMostRecent() throws Exception { + // Entry 1: USER at t1 + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_first", null); - // Verify maintenance signal with CONTROLLER reason only MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getReason(), "user_first"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - // Step 2: USER (UserA) puts the cluster into MM (t2) - Thread.sleep(10); // Ensure different timestamps - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", null); + Thread.sleep(10); + + // Entry 2: AUTOMATION at t2 (should become most recent) + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_second", null); - // Verify maintenance signal has both reasons signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertEquals(signal.getReason(), "automation_second"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); - // Step 3: USER (UserB) puts the cluster into MM (t3) - overrides UserA's entry Thread.sleep(10); - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_C", null); - // Verify maintenance signal still has same number of reasons but UserB's reason replaced UserA's + // Entry 3: USER at t3 (should become most recent) + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_third", null); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_C"); + Assert.assertEquals(signal.getReason(), "user_third"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - // Step 4: AUTOMATION (HelixACM) puts the cluster into MM (t4) Thread.sleep(10); - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_D", null); - // Verify maintenance signal has all three reasons + // Entry 4: AUTOMATION again at t4 (should become most recent, overriding its previous entry) + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_fourth", null); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(signal.getReason(), "automation_fourth"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); + // Should still have 2 actors (AUTOMATION entry was overwritten) + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); - // Step 5: USER (Old Client) enters cluster into MM (t5) - // Simulate old client by directly creating a MaintenanceSignal with simple fields only - Thread.sleep(10); + // Clean up + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + } + + /** + * Test edge cases around empty maintenance mode + */ + @Test + public void testEmptyStateEdgeCases() throws Exception { + // Test 1: Try to exit when no maintenance mode exists + + // AUTOMATION tries to exit when no MM exists -> should be no-op + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + Assert.assertNull(_dataAccessor.getProperty(_keyBuilder.maintenance())); + + // USER tries to exit when no MM exists -> should be no-op (nothing to override) + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + Assert.assertNull(_dataAccessor.getProperty(_keyBuilder.maintenance())); + + // Test 2: Create maintenance mode with only simpleFields (old client style) ZNRecord record = new ZNRecord("maintenance"); - record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "reason_E"); - record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), - String.valueOf(System.currentTimeMillis())); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "Old client reason"); record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), MaintenanceSignal.TriggeringEntity.USER.name()); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); - // Write directly to ZK - _dataAccessor.updateProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, "Should have no listField reasons"); + Assert.assertFalse(signal.hasMaintenanceReasons(), "Should report no maintenance reasons"); - // Verify maintenance signal has updated simple fields but listField still has old USER entry - signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - Assert.assertEquals(signal.getReason(), "reason_E"); + // AUTOMATION tries to exit -> should be no-op + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + Assert.assertNotNull(_dataAccessor.getProperty(_keyBuilder.maintenance()), + "Should still exist after no-op"); - // Verify reasons list still has original 3 entries (not updated by old client) - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + // USER tries to exit -> should be administrative override + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Test Case A: New clients interaction where each actor enters and exits properly - * 1. Each actor (User, Automation, Controller) enters maintenance mode - * 2. Each actor exits maintenance mode in sequence - * 3. Verify maintenance flags in history records + * Test mixed entry/exit scenarios to stress test maintenance mode stacking + * Verifies complex sequences of actors entering and exiting in different orders */ @Test - public void testMultiActorMaintenanceModeExitSequence() throws Exception { - // Set up the initial state with all actors having entered maintenance mode - setupMultiActorMaintenanceScenario(); - - // Step 6A: USER (New client) exits MM - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + public void testMixedEntryExitScenarios() throws Exception { + // Phase 1: Multiple actors enter + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason_1", null); + Thread.sleep(10); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason_1", null); - // Verify USER reason is gone, but CONTROLLER and AUTOMATION remain MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(signal); Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertEquals(signal.getReason(), "automation_reason_1"); // Most recent - // Verify in history that we're still in maintenance - ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - Map lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); - - // Step 7A: AUTOMATION exits MM - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Phase 2: One actor exits, then re-enters with different reason + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); // USER exits + Thread.sleep(10); - // Verify AUTOMATION reason is gone, only CONTROLLER remains signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(signal); Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); - Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Verify in history that we're still in maintenance - history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "AUTOMATION"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "true"); + // USER re-enters with new reason + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason_2", null); - // Step 8A: CONTROLLER exits MM - _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode( - CLUSTER_NAME, false, null, MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getReason(), "user_reason_2"); // Most recent - // Verify we're out of maintenance mode + // Phase 3: Clean exit + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); - - // Verify in history that we're no longer in maintenance - history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "CONTROLLER"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "false"); } /** - * Test Case B: Old client enters, new clients reconcile during future operations - * 1. Old client enters maintenance mode without updating the reasons list - * 2. New client enters maintenance mode and reconciles the list - * 3. All actors exit in sequence + * Test basic multi-actor stacking behavior including CONTROLLER entity + * Creates actual conditions that trigger CONTROLLER maintenance mode */ @Test - public void testMultiActorMaintenanceModeReconciliation() throws Exception { - // Set up the initial state with all actors having entered maintenance mode - setupMultiActorMaintenanceScenario(); - - // Step 6B: AUTOMATION enters MM again - should reconcile the old client's USER update - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_F", null); + public void testMultiActorStackingWithController() throws Exception { + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 1: Directly trigger CONTROLLER maintenance mode using API + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "Test controller maintenance", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - // Verify signal has reconciled the old client's USER update with the timestamp + // Verify CONTROLLER entered maintenance mode automatically MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + + // Step 2: USER enters MM - should stack with CONTROLLER + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "user_reason"); // Most recent + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Step 3: AUTOMATION enters MM - should stack with both + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); + Assert.assertEquals(signal.getReason(), "automation_reason"); // Most recent Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Verify reason is now "reason_E" - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_E"); + // Step 4: USER exits - should remain in maintenance with CONTROLLER and AUTOMATION + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - // Exit all actors in sequence - _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode( - CLUSTER_NAME, false, null, MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE); - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Verify we're out of maintenance mode - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); - } + // Step 5: AUTOMATION exits - should remain in maintenance with only CONTROLLER + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - /** - * Test Case C: Old client exits maintenance mode while other actors still have reasons - * 1. Old client bypasses the normal API and just deletes the maintenance node - * 2. Verify the cluster exits maintenance mode completely - */ - @Test - public void testMultiActorMaintenanceModeOldClientExit() throws Exception { - // Set up the initial state with all actors having entered maintenance mode - setupMultiActorMaintenanceScenario(); + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Step 6C: USER (old client) exits MM - simulating old client removing entire node - _dataAccessor.removeProperty(_keyBuilder.maintenance()); + // Step 6: Exit CONTROLLER maintenance mode + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, null, + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - // Verify we're out of maintenance mode completely + // Verify maintenance mode is completely off TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Old client overrides simple fields after new clients enter MM, - * then new client exits but should not exit maintenance mode completely - * 1. AUTOMATION enters MM (new client) - * 2. USER enters MM again (old client - only updates simple fields) - * 3. AUTOMATION exits MM - * 4. Verify we're still in maintenance mode because USER reason remains + * Test old client wipes data including CONTROLLER entry + * Verifies CONTROLLER no-op behavior after data loss */ @Test - public void testMultiActorMaintenanceModeOldClientOverride() throws Exception { - // Step 1: AUTOMATION enters MM (t2) - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, - "AUTOMATION reason", null); + public void testControllerNoOpAfterOldClientWipe() throws Exception { + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 1: Directly trigger CONTROLLER maintenance mode using API + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "Test controller maintenance", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - // Verify maintenance signal has only AUTOMATION reason. + // Verify CONTROLLER entered maintenance mode MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + + // Add USER and AUTOMATION to create multi-actor scenario + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Step 2: USER enters MM (t3) via old client (only updates simple fields) - Thread.sleep(10); - long t3 = System.currentTimeMillis(); - // Simulate an old client by creating a MaintenanceSignal with only simple fields - ZNRecord record = new ZNRecord(signal.getRecord()); - record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "USER old client reason"); - record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(t3)); + // Simulate old client wiping all data + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "Old client reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), MaintenanceSignal.TriggeringEntity.USER.name()); - // Don't update the listField - simulate old client behavior - - // Write it directly to ZK + // Old client doesn't set listField - simulates wiping all listField data _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); - // Verify the simple fields were updated but the reasons list still has old USER entry + // Verify old client wiped listField data signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + "Old client should have wiped listField data"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - Assert.assertEquals(signal.getReason(), "USER old client reason"); - // MaintenanceReasonsCount should still be 2 because the old client didn't update the reasons list - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + // Try CONTROLLER exit - should be no-op since its entry was wiped + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, null, + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - // Step 3: AUTOMATION exits MM - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Verify maintenance signal remains the same (no-op) + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal, "Should still be in maintenance after CONTROLLER no-op"); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0); + Assert.assertEquals(signal.getReason(), "Old client reason"); + + // Try AUTOMATION exit - should also be no-op since its entry was wiped + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - // Verify we're still in maintenance mode because USER reason remains + // Verify maintenance signal remains the same (no-op) signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); - Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertNotNull(signal, "Should still be in maintenance after AUTOMATION no-op"); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0); + Assert.assertEquals(signal.getReason(), "Old client reason"); - // Check that simple fields were updated to USER values - Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); - Assert.assertEquals(signal.getReason(), "USER old client reason"); + // USER tries to exit -> should be administrative override + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); - // Clean up - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Verify completely out of maintenance mode TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } /** - * Entity tries to exit MM without having entered it - * 1. AUTOMATION enters MM - * 2. USER tries to exit MM (shouldn't affect MM state) - * 3. Verify we're still in maintenance mode because USER wasn't an actor - * 4. AUTOMATION exits MM - * 5. Verify we're out of maintenance mode + * Test CONTROLLER override behavior - same entity overwrites previous entry */ @Test - public void testMultiActorMaintenanceModeInvalidExit() throws Exception { - // Step 1: AUTOMATION enters MM - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "AUTOMATION reason", null); + public void testControllerOverrideBehavior() throws Exception { + ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); + clusterConfig.setMaxPartitionsPerInstance(-1); + _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); + // Step 1: Directly trigger CONTROLLER maintenance mode with specific auto-trigger reason + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "Initial controller reason", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - // Verify maintenance signal with AUTOMATION reason + // Verify CONTROLLER entered with auto-trigger reason MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getAutoTriggerReason(), + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - // Step 2: USER tries to exit MM (shouldn't affect MM state) - _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Manually trigger CONTROLLER entry again with different reason (to test override) + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, + "manual_controller_reason", + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - // Step 3: Verify we're still in maintenance mode signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); - Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); + Assert.assertEquals(signal.getAutoTriggerReason(), + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), + 1); // Should still be only 1 (CONTROLLER overrode itself) - // Step 4: AUTOMATION exits MM - _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + // Add another actor to verify stacking still works + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, + "user_reason", null); - // Step 5: Verify we're out of maintenance mode + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + + // Cleanup + _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, + null, + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, + null, null); TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); } + + /** + * Test reconciliation of legacy USER data when new client adds reason + * Verifies the critical design requirement to preserve old client data + */ + @Test + public void testReconciliationOfLegacyUserData() throws Exception { + // Step 1: Old client sets simpleFields only (no listFields) + ZNRecord record = new ZNRecord("maintenance"); + record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "legacy_user_reason"); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(), + MaintenanceSignal.TriggeringEntity.USER.name()); + record.setSimpleField(MaintenanceSignal.MaintenanceSignalProperty.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + _dataAccessor.setProperty(_keyBuilder.maintenance(), new MaintenanceSignal(record)); + + // Verify old client state (no listFields reasons) + MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertNotNull(signal); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + "Old client should have no listField data"); + Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + Assert.assertEquals(signal.getReason(), "legacy_user_reason"); + + // Step 2: New client adds AUTOMATION reason - should trigger reconciliation + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, + "automation_reason", null); + + // Step 3: Verify both reasons are preserved after reconciliation + signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); + Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2, + "Should have both reconciled USER and new AUTOMATION reasons"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "USER reason should be preserved"); + Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "AUTOMATION reason should be added"); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + "legacy_user_reason"); + Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + "automation_reason"); + + // Cleanup + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, + false, null, null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, + false, null, null); + TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, + 2000L); + } } From 4f829df0c699a86c73524ea7bba6e7790d558f4c Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Wed, 16 Jul 2025 06:47:27 +0530 Subject: [PATCH 09/10] Address review comments --- .../apache/helix/manager/zk/ZKHelixAdmin.java | 22 +- .../apache/helix/model/MaintenanceSignal.java | 222 +++++++----------- .../TestClusterMaintenanceMode.java | 116 +++++---- 3 files changed, 169 insertions(+), 191 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index 98d1ea6e9c..f4b2b639e3 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1224,12 +1224,23 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, if (!enabled) { // Exit maintenance mode for this specific triggering entity - if (maintenanceSignal != null) { + // Early return if no maintenance signal exists + if (maintenanceSignal == null) { + if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { + logger.info("USER administrative override: no maintenance signal exists, nothing to remove"); + } else { + // CONTROLLER/AUTOMATION: strict no-op + logger.info("Entity {} attempted to exit maintenance mode but no maintenance signal exists", triggeringEntity); + } + return; + } + // If a specific actor is exiting maintenance mode boolean removed = maintenanceSignal.removeMaintenanceReason(triggeringEntity); if (removed) { // If there are still reasons for maintenance mode, update the ZNode + if (maintenanceSignal.getRecord().getListField("reasons") != null && !maintenanceSignal.getRecord().getListField("reasons").isEmpty()) { if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { @@ -1249,15 +1260,6 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, } else { // CONTROLLER/AUTOMATION: strict no-op if their entry not found logger.info("Entity {} doesn't have a maintenance reason entry, exit request ignored", triggeringEntity); - } - } - } else { - // No maintenance signal exists - if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { - logger.info("USER administrative override: no maintenance signal exists, nothing to remove"); - } else { - // CONTROLLER/AUTOMATION: strict no-op - logger.info("Entity {} attempted to exit maintenance mode but no maintenance signal exists", triggeringEntity); } } } else { diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index b08cc2630e..e286b2ec74 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -20,6 +20,7 @@ */ import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -131,7 +132,7 @@ public long getTimestamp() { } /** - * Add a new maintenance reason (or update an existing one if the triggering entity already has a reason). + * Add a new maintenance reason. If the triggering entity already has a reason, it will be replaced. * * @param reason The reason for maintenance * @param timestamp The timestamp when maintenance was triggered @@ -141,34 +142,22 @@ public void addMaintenanceReason(String reason, long timestamp, TriggeringEntity LOG.info("Adding maintenance reason for entity: {}, reason: {}, timestamp: {}", triggeringEntity, reason, timestamp); - // The triggering entity is our unique key - Overwrite any existing entry with this entity - String triggerEntityStr = triggeringEntity.name(); - List> reasons = getMaintenanceReasons(); LOG.debug("Before addition: Reasons list contains {} entries", reasons.size()); - boolean found = false; - for (Map entry : reasons) { - if (triggerEntityStr.equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { - entry.put(PauseSignalProperty.REASON.name(), reason); - entry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp)); - found = true; - LOG.debug("Updated existing entry for entity: {}", triggeringEntity); - break; - } - } + // Filter out any existing reasons for this triggering entity + List> filteredReasons = filterReasons(reasons, null, + Arrays.asList(triggeringEntity)); - if (!found) { - Map newEntry = new HashMap<>(); - newEntry.put(PauseSignalProperty.REASON.name(), reason); - newEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp)); - newEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), triggerEntityStr); - reasons.add(newEntry); - LOG.debug("Added new entry for entity: {}", triggeringEntity); - } + // Always add the new reason at the end of the list + Map newEntry = new HashMap<>(); + newEntry.put(PauseSignalProperty.REASON.name(), reason); + newEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), Long.toString(timestamp)); + newEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), triggeringEntity.name()); + filteredReasons.add(newEntry); - updateReasonsListField(reasons); - LOG.debug("After addition: Reasons list contains {} entries", reasons.size()); + updateReasonsListField(filteredReasons); + LOG.debug("After addition: Reasons list contains {} entries", filteredReasons.size()); } /** @@ -247,73 +236,77 @@ public boolean removeMaintenanceReason(TriggeringEntity triggeringEntity) { LOG.info("Removing maintenance reason for entity: {}", triggeringEntity); List> reasons = getMaintenanceReasons(); - - boolean entityExists = false; - for (Map entry : reasons) { - String entryEntity = entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()); - if (triggeringEntity.name().equals(entryEntity)) { - entityExists = true; - break; - } - } - - if (!entityExists) { + List> filteredReasons = filterReasons(reasons, null, + Arrays.asList(triggeringEntity)); + + // Return false early if no entities were filtered out + if (filteredReasons.size() == reasons.size()) { LOG.info("Entity {} doesn't have a maintenance reason entry, ignoring exit request", triggeringEntity); return false; } - int originalSize = reasons.size(); - LOG.debug("Before removal: Reasons list contains {} entries", reasons.size()); + // Update reasons list field with filtered reasons + updateReasonsListField(filteredReasons); - List> updatedReasons = new ArrayList<>(); - String targetEntity = triggeringEntity.name(); + // Always set/reset the simpleFields if filteredReasons.size() != 0 + if (!filteredReasons.isEmpty()) { + // Get the most recent reason (last element, since list is in chronological order) + Map mostRecent = filteredReasons.get(filteredReasons.size() - 1); + String newReason = mostRecent.get(PauseSignalProperty.REASON.name()); + long newTimestamp = Long.parseLong(mostRecent.get(MaintenanceSignalProperty.TIMESTAMP.name())); + TriggeringEntity newEntity = TriggeringEntity.valueOf( + mostRecent.get(MaintenanceSignalProperty.TRIGGERED_BY.name())); - // Only keep reasons that don't match the triggering entity - for (Map entry : reasons) { - String entryEntity = entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()); - if (!targetEntity.equals(entryEntity)) { - updatedReasons.add(entry); - } else { - LOG.debug("Removing entry with reason: {} for entity: {}", - entry.get(PauseSignalProperty.REASON.name()), entryEntity); - } + LOG.info("Updated to most recent reason: {}, entity: {}, timestamp: {}", + newReason, newEntity, newTimestamp); + + setReason(newReason); + setTimestamp(newTimestamp); + setTriggeringEntity(newEntity); } - boolean removed = updatedReasons.size() < originalSize; - LOG.debug("After removal: Reasons list contains {} entries", updatedReasons.size()); - - if (removed) { - updateReasonsListField(updatedReasons); - - // Update the simpleFields with the most recent reason (for backward compatibility) - if (!updatedReasons.isEmpty()) { - // Sort by timestamp in descending order to get the most recent - updatedReasons.sort((r1, r2) -> { - long t1 = Long.parseLong(r1.get(MaintenanceSignalProperty.TIMESTAMP.name())); - long t2 = Long.parseLong(r2.get(MaintenanceSignalProperty.TIMESTAMP.name())); - return Long.compare(t2, t1); - }); - - Map mostRecent = updatedReasons.get(0); - String newReason = mostRecent.get(PauseSignalProperty.REASON.name()); - long newTimestamp = Long.parseLong(mostRecent.get(MaintenanceSignalProperty.TIMESTAMP.name())); - TriggeringEntity newEntity = TriggeringEntity.valueOf( - mostRecent.get(MaintenanceSignalProperty.TRIGGERED_BY.name())); - - LOG.info("Updated to most recent reason: {}, entity: {}, timestamp: {}", - newReason, newEntity, newTimestamp); - - setReason(newReason); - setTimestamp(newTimestamp); - setTriggeringEntity(newEntity); + return true; + } + + /** + * Filter maintenance reasons based on include and exclude entity lists. + * + * @param reasons The original list of maintenance reasons + * @param includeEntities List of entities to include (null means include all) + * @param excludeEntities List of entities to exclude (null means exclude none) + * @return Filtered list of maintenance reasons (maintains original order) + */ + private List> filterReasons(List> reasons, + List includeEntities, + List excludeEntities) { + List> filtered = new ArrayList<>(); + + for (Map reason : reasons) { + String triggeredByStr = reason.get(MaintenanceSignalProperty.TRIGGERED_BY.name()); + if (triggeredByStr == null) { + continue; + } + + TriggeringEntity entity; + try { + entity = TriggeringEntity.valueOf(triggeredByStr); + } catch (IllegalArgumentException ex) { + LOG.warn("Unknown triggering entity: {}, skipping", triggeredByStr); + continue; + } + + if ((includeEntities != null && !includeEntities.contains(entity)) || + (excludeEntities != null && excludeEntities.contains(entity))) { + continue; } - } else { - LOG.info("No matching maintenance reason found for entity: {}", triggeringEntity); + + filtered.add(reason); } - return removed; + return filtered; } + /** * Check if there are any active maintenance reasons. * @@ -331,49 +324,14 @@ public boolean hasMaintenanceReasons() { */ public boolean hasMaintenanceReason(TriggeringEntity triggeringEntity) { List> reasons = getMaintenanceReasons(); - for (Map entry : reasons) { - if (triggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { - return true; - } - } - return false; + List> filteredReasons = filterReasons(reasons, + Arrays.asList(triggeringEntity), null); + return !filteredReasons.isEmpty(); } - /** - * Gets the maintenance reason details for a specific triggering entity. - * - * @param triggeringEntity The entity to get reason details for - * @return Map containing reason details, or null if not found - */ - public Map getMaintenanceReasonDetails(TriggeringEntity triggeringEntity) { - List> reasons = getMaintenanceReasons(); - for (Map entry : reasons) { - if (triggeringEntity.name().equals(entry.get(MaintenanceSignalProperty.TRIGGERED_BY.name()))) { - return entry; - } - } - return null; - } - /** - * Gets the number of active maintenance reasons. - * - * @return The count of active maintenance reasons - */ - public int getMaintenanceReasonsCount() { - return getMaintenanceReasons().size(); - } - /** - * Gets the maintenance reason from a specific triggering entity. - * - * @param triggeringEntity The entity to get reason for - * @return The reason string, or null if not found - */ - public String getMaintenanceReason(TriggeringEntity triggeringEntity) { - Map details = getMaintenanceReasonDetails(triggeringEntity); - return details != null ? details.get(PauseSignalProperty.REASON.name()) : null; - } + /** * Reconcile legacy data from simpleFields into listFields.reasons if it's missing. @@ -390,23 +348,23 @@ public void reconcileLegacyData() { TriggeringEntity simpleEntity = getTriggeringEntity(); long simpleTimestamp = getTimestamp(); - // Only reconcile USER data from legacy clients - // CONTROLLER and AUTOMATION should not have legacy data loss scenarios - if (simpleReason != null && !simpleReason.isEmpty() && simpleEntity == TriggeringEntity.USER - && simpleTimestamp > 0 && !hasMaintenanceReason(simpleEntity)) { + // Early return if no simple reason exists, not a USER entity, or USER already has a reason + List> reasons = getMaintenanceReasons(); + if (simpleReason == null || simpleEntity != TriggeringEntity.USER || + !filterReasons(reasons, Arrays.asList(TriggeringEntity.USER), null).isEmpty()) { + return; + } - // Legacy USER data exists but not in listFields - preserve it - Map legacyEntry = new HashMap<>(); - legacyEntry.put(PauseSignalProperty.REASON.name(), simpleReason); - legacyEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(simpleTimestamp)); - legacyEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), simpleEntity.name()); + // Legacy USER data exists but not in listFields - preserve it + Map legacyEntry = new HashMap<>(); + legacyEntry.put(PauseSignalProperty.REASON.name(), simpleReason); + legacyEntry.put(MaintenanceSignalProperty.TIMESTAMP.name(), String.valueOf(simpleTimestamp)); + legacyEntry.put(MaintenanceSignalProperty.TRIGGERED_BY.name(), simpleEntity.name()); - List> reasons = getMaintenanceReasons(); - reasons.add(legacyEntry); - updateReasonsListField(reasons); + reasons.add(legacyEntry); + updateReasonsListField(reasons); - LOG.info("Reconciled legacy USER maintenance data: reason={}, timestamp={}", - simpleReason, simpleTimestamp); - } + LOG.info("Reconciled legacy USER maintenance data: reason={}, timestamp={}", + simpleReason, simpleTimestamp); } } diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index 4288179799..29561d5fd2 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.HashMap; +import java.util.List; import java.util.Map; import javax.management.MalformedObjectNameException; import javax.management.ObjectName; @@ -442,6 +443,23 @@ private static Map convertStringToMap(String value) throws IOExc TypeFactory.defaultInstance().constructMapType(HashMap.class, String.class, String.class)); } + /** + * Helper method to get maintenance reason for a specific triggering entity. + * @param signal The maintenance signal + * @param triggeringEntity The entity to get reason for + * @return The reason string, or null if not found + */ + private static String getMaintenanceReason(MaintenanceSignal signal, MaintenanceSignal.TriggeringEntity triggeringEntity) { + List> reasons = signal.getMaintenanceReasons(); + for (Map reason : reasons) { + String triggeredByStr = reason.get(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name()); + if (triggeredByStr != null && MaintenanceSignal.TriggeringEntity.valueOf(triggeredByStr) == triggeringEntity) { + return reason.get(PauseSignal.PauseSignalProperty.REASON.name()); + } + } + return null; + } + /** * Test basic multi-actor stacking behavior. * Verifies core functionality: actor-based stacking, actor override, simpleFields most recent @@ -459,11 +477,11 @@ public void testAutomationMaintenanceMode() throws Exception { MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); Assert.assertEquals(signal.getReason(), "reason_A"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_A"); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_A"); Thread.sleep(10); // Ensure different timestamps @@ -471,13 +489,13 @@ public void testAutomationMaintenanceMode() throws Exception { _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_B", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Most recent Assert.assertEquals(signal.getReason(), "reason_B"); // Most recent Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_A"); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B"); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_A"); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B"); Thread.sleep(10); @@ -486,12 +504,12 @@ public void testAutomationMaintenanceMode() throws Exception { "reason_C", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); // Still only 2 (USER overrode itself) + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); // Still only 2 (USER overrode itself) Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); // Most recent Assert.assertEquals(signal.getReason(), "reason_C"); // Most recent - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_C"); // Updated - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B"); // Unchanged Thread.sleep(10); @@ -501,12 +519,12 @@ public void testAutomationMaintenanceMode() throws Exception { "reason_D", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); // Still only 2 (AUTOMATION overrode itself) + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); // Still only 2 (AUTOMATION overrode itself) Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Most recent Assert.assertEquals(signal.getReason(), "reason_D"); // Most recent - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_C"); // Unchanged - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_D"); // Updated // Clean exit sequence: actors exit in order @@ -515,7 +533,7 @@ public void testAutomationMaintenanceMode() throws Exception { signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); Assert.assertEquals(signal.getReason(), "reason_D"); // Updated to remaining reason @@ -555,7 +573,7 @@ public void testLegacyClientCompatibility() throws Exception { // Verify multi-actor setup MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -574,7 +592,7 @@ public void testLegacyClientCompatibility() throws Exception { // Verify old client wiped listField data but simpleFields remain signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Old client should have wiped listField data"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); Assert.assertEquals(signal.getReason(), "reason_D"); @@ -588,7 +606,7 @@ public void testLegacyClientCompatibility() throws Exception { // Verify maintenance signal remains the same (no-op) signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal, "Should still be in maintenance after AUTOMATION no-op"); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); Assert.assertEquals(signal.getReason(), "reason_D"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); @@ -620,7 +638,7 @@ private void setupMultiActorMaintenanceScenario() throws Exception { // Verify maintenance signal with USER reason only MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); // Step 1: USER (UserA) puts the cluster into MM (t1) @@ -630,7 +648,7 @@ private void setupMultiActorMaintenanceScenario() throws Exception { // Verify maintenance signal with USER reason only signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); // Step 2: AUTOMATION puts the cluster into MM (t2) @@ -640,7 +658,7 @@ private void setupMultiActorMaintenanceScenario() throws Exception { // Verify maintenance signal has both reasons signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -651,8 +669,8 @@ private void setupMultiActorMaintenanceScenario() throws Exception { // Verify maintenance signal still has same number of reasons but UserB's reason replaced UserA's signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "reason_C"); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_C"); // Step 4: USER (Old Client) enters cluster into MM (t4) // Simulate old client by directly creating a MaintenanceSignal with simple fields only @@ -675,7 +693,7 @@ private void setupMultiActorMaintenanceScenario() throws Exception { Assert.assertEquals(signal.getReason(), "reason_D"); // Verify reasons list was wiped by old client (data loss accepted by design) - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Old client should have wiped all listField data"); } @@ -694,7 +712,7 @@ public void testUserAdministrativeOverride() throws Exception { // Verify the old client has wiped the listField data MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Old client should have wiped listField data"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); Assert.assertEquals(signal.getReason(), "reason_D"); @@ -736,7 +754,7 @@ public void testOldClientDataLoss() throws Exception { Assert.assertEquals(signal.getReason(), "reason_D"); // But listFields.reasons should be empty (wiped by old client) - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Old client should have wiped listField data"); Assert.assertFalse(signal.hasMaintenanceReasons(), "Should not have maintenance reasons after old client wipe"); @@ -747,7 +765,7 @@ public void testOldClientDataLoss() throws Exception { // Verify signal now has only AUTOMATION reason (no reconciliation of old USER data) signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER), "Controller data was lost"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), @@ -755,7 +773,7 @@ public void testOldClientDataLoss() throws Exception { Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); // Verify the new AUTOMATION reason - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_F"); // Exit the only remaining actor @@ -782,10 +800,10 @@ public void testAutomationReentryAfterDataLoss() throws Exception { // Verify signal now has only AUTOMATION reason (no reconciliation of wiped data) MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_E"); // Case B Step 7: USER exits MM -> should be administrative override since USER has no listField entry @@ -809,7 +827,7 @@ public void testOldClientDeletesEntireZnode() throws Exception { // Verify we have both actors MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); // Case D: USER (old client) exits MM by deleting entire znode _dataAccessor.removeProperty(_keyBuilder.maintenance()); @@ -861,7 +879,7 @@ public void testSimpleFieldsReflectMostRecent() throws Exception { Assert.assertEquals(signal.getReason(), "automation_fourth"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); // Should still have 2 actors (AUTOMATION entry was overwritten) - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); // Clean up _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, @@ -899,7 +917,7 @@ public void testEmptyStateEdgeCases() throws Exception { MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, "Should have no listField reasons"); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Should have no listField reasons"); Assert.assertFalse(signal.hasMaintenanceReasons(), "Should report no maintenance reasons"); // AUTOMATION tries to exit -> should be no-op @@ -928,7 +946,7 @@ public void testMixedEntryExitScenarios() throws Exception { "automation_reason_1", null); MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getReason(), "automation_reason_1"); // Most recent // Phase 2: One actor exits, then re-enters with different reason @@ -937,7 +955,7 @@ public void testMixedEntryExitScenarios() throws Exception { Thread.sleep(10); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -946,7 +964,7 @@ public void testMixedEntryExitScenarios() throws Exception { "user_reason_2", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getReason(), "user_reason_2"); // Most recent // Phase 3: Clean exit @@ -975,14 +993,14 @@ public void testMultiActorStackingWithController() throws Exception { MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); // Step 2: USER enters MM - should stack with CONTROLLER _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "user_reason", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); Assert.assertEquals(signal.getReason(), "user_reason"); // Most recent Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); @@ -993,7 +1011,7 @@ public void testMultiActorStackingWithController() throws Exception { "automation_reason", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); Assert.assertEquals(signal.getReason(), "automation_reason"); // Most recent Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); @@ -1006,7 +1024,7 @@ public void testMultiActorStackingWithController() throws Exception { signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -1017,7 +1035,7 @@ public void testMultiActorStackingWithController() throws Exception { signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -1057,7 +1075,7 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { "automation_reason", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 3); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 3); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -1074,7 +1092,7 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { // Verify old client wiped listField data signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Old client should have wiped listField data"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); @@ -1085,7 +1103,7 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { // Verify maintenance signal remains the same (no-op) signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal, "Should still be in maintenance after CONTROLLER no-op"); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); Assert.assertEquals(signal.getReason(), "Old client reason"); // Try AUTOMATION exit - should also be no-op since its entry was wiped @@ -1095,7 +1113,7 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { // Verify maintenance signal remains the same (no-op) signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal, "Should still be in maintenance after AUTOMATION no-op"); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); Assert.assertEquals(signal.getReason(), "Old client reason"); // USER tries to exit -> should be administrative override @@ -1125,7 +1143,7 @@ public void testControllerOverrideBehavior() throws Exception { Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(signal.getAutoTriggerReason(), MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 1); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); // Manually trigger CONTROLLER entry again with different reason (to test override) _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, true, @@ -1136,7 +1154,7 @@ public void testControllerOverrideBehavior() throws Exception { Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(signal.getAutoTriggerReason(), MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), + Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); // Should still be only 1 (CONTROLLER overrode itself) // Add another actor to verify stacking still works @@ -1144,7 +1162,7 @@ public void testControllerOverrideBehavior() throws Exception { "user_reason", null); signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2); + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); @@ -1175,7 +1193,7 @@ public void testReconciliationOfLegacyUserData() throws Exception { // Verify old client state (no listFields reasons) MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertNotNull(signal); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 0, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 0, "Old client should have no listField data"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); Assert.assertEquals(signal.getReason(), "legacy_user_reason"); @@ -1186,15 +1204,15 @@ public void testReconciliationOfLegacyUserData() throws Exception { // Step 3: Verify both reasons are preserved after reconciliation signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); - Assert.assertEquals(signal.getMaintenanceReasonsCount(), 2, + Assert.assertEquals(signal.getMaintenanceReasons().size(), 2, "Should have both reconciled USER and new AUTOMATION reasons"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "USER reason should be preserved"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), "AUTOMATION reason should be added"); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "legacy_user_reason"); - Assert.assertEquals(signal.getMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION), + Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "automation_reason"); // Cleanup From b7ed0691c045792b3fa248d42ede1c6f758e7a9a Mon Sep 17 00:00:00 2001 From: LZD-PratyushBhatt Date: Wed, 16 Jul 2025 11:26:19 +0530 Subject: [PATCH 10/10] Make tests more robust --- .../apache/helix/manager/zk/ZKHelixAdmin.java | 3 +- .../TestClusterMaintenanceMode.java | 180 ++++++++++++++++-- 2 files changed, 160 insertions(+), 23 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index f4b2b639e3..91468ca847 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1241,8 +1241,7 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, if (removed) { // If there are still reasons for maintenance mode, update the ZNode - if (maintenanceSignal.getRecord().getListField("reasons") != null - && !maintenanceSignal.getRecord().getListField("reasons").isEmpty()) { + if (maintenanceSignal.hasMaintenanceReasons()) { if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { throw new HelixException("Failed to update maintenance signal!"); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index 29561d5fd2..c69c7913c4 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -460,17 +460,39 @@ private static String getMaintenanceReason(MaintenanceSignal signal, Maintenance return null; } + /** + * Utility method to verify maintenance history entry. + * @param expectedOperationType Expected operation type (ENTER/EXIT) + * @param expectedTriggeredBy Expected triggering entity (USER/AUTOMATION/CONTROLLER) + * @param expectedInMaintenanceAfterOperation Expected maintenance state after operation (true/false) + * @param expectedReason Expected reason (optional, can be null) + */ + private void verifyMaintenanceHistory(String expectedOperationType, String expectedTriggeredBy, + String expectedInMaintenanceAfterOperation, String expectedReason) throws Exception { + ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); + Map lastEntry = convertStringToMap( + history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); + Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), expectedOperationType); + Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), expectedTriggeredBy); + Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), expectedInMaintenanceAfterOperation); + if (expectedReason != null) { + Assert.assertEquals(lastEntry.get("REASON"), expectedReason); + } + } + /** * Test basic multi-actor stacking behavior. * Verifies core functionality: actor-based stacking, actor override, simpleFields most recent */ @Test public void testAutomationMaintenanceMode() throws Exception { + boolean result; ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be out of maintenance mode."); // Step 1: USER enters MM (t1) with reason_A _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_A", null); @@ -483,6 +505,9 @@ public void testAutomationMaintenanceMode() throws Exception { Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_A"); + // Verify history entry for USER entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "reason_A"); + Thread.sleep(10); // Ensure different timestamps // Step 2: AUTOMATION enters MM (t2) with reason_B - should stack with USER @@ -497,6 +522,9 @@ public void testAutomationMaintenanceMode() throws Exception { Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.USER), "reason_A"); Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B"); + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "reason_B"); + Thread.sleep(10); // Step 3: USER enters MM again (t3) with reason_C - should override previous USER entry @@ -512,6 +540,9 @@ public void testAutomationMaintenanceMode() throws Exception { Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_B"); // Unchanged + // Verify history entry for USER overriding previous entry + verifyMaintenanceHistory("ENTER", "USER", "true", "reason_C"); + Thread.sleep(10); // Step 4: AUTOMATION enters MM again (t4) with reason_D - should override previous AUTOMATION entry @@ -527,6 +558,9 @@ public void testAutomationMaintenanceMode() throws Exception { Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_D"); // Updated + // Verify history entry for AUTOMATION overriding previous entry + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "reason_D"); + // Clean exit sequence: actors exit in order _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); @@ -538,9 +572,16 @@ public void testAutomationMaintenanceMode() throws Exception { Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); Assert.assertEquals(signal.getReason(), "reason_D"); // Updated to remaining reason + // Verify history entry for USER exiting maintenance (but still in maintenance due to AUTOMATION) + verifyMaintenanceHistory("EXIT", "USER", "true", null); + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode."); + + // Verify history entry for AUTOMATION exiting maintenance (completely out) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "false", null); } /** @@ -552,16 +593,18 @@ public void testAutomationMaintenanceMode() throws Exception { */ @Test public void testLegacyClientCompatibility() throws Exception { + boolean result; ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); clusterConfig.setMaxPartitionsPerInstance(-1); clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); // Wait for config to be applied - TestHelper.verify(() -> { + result = TestHelper.verify(() -> { ClusterConfig currentConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); return currentConfig.getNumOfflineInstancesForAutoExit() == -1; }, 2000L); + Assert.assertTrue(result, "Config should be applied."); _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "reason_A", null); Thread.sleep(10); @@ -578,6 +621,9 @@ public void testLegacyClientCompatibility() throws Exception { Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "reason_C"); + // Simulate old client wiping listField data (only keeps simpleFields) Thread.sleep(10); ZNRecord record = new ZNRecord("maintenance"); @@ -610,12 +656,19 @@ public void testLegacyClientCompatibility() throws Exception { Assert.assertEquals(signal.getReason(), "reason_D"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + // Verify history entry for AUTOMATION no-op exit (still in maintenance) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "true", null); + // USER tries to exit MM -> should trigger administrative override and delete maintenance ZNode _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); // Verify we're completely out of maintenance mode due to USER administrative override - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode due to USER administrative override."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); } /** @@ -705,6 +758,7 @@ private void setupMultiActorMaintenanceScenario() throws Exception { */ @Test public void testUserAdministrativeOverride() throws Exception { + boolean result; // Set up the initial state with all actors having entered maintenance mode // Note: setupMultiActorMaintenanceScenario() ends with old client wiping listField data setupMultiActorMaintenanceScenario(); @@ -723,15 +777,11 @@ public void testUserAdministrativeOverride() throws Exception { null, null); // Verify we're completely out of maintenance mode due to USER administrative override - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode due to USER administrative override."); // Verify in history that we're no longer in maintenance - ControllerHistory history = _dataAccessor.getProperty(_keyBuilder.controllerLeaderHistory()); - Map lastEntry = convertStringToMap( - history.getMaintenanceHistoryList().get(history.getMaintenanceHistoryList().size() - 1)); - Assert.assertEquals(lastEntry.get("OPERATION_TYPE"), "EXIT"); - Assert.assertEquals(lastEntry.get("TRIGGERED_BY"), "USER"); - Assert.assertEquals(lastEntry.get("IN_MAINTENANCE_AFTER_OPERATION"), "false"); + verifyMaintenanceHistory("EXIT", "USER", "false", null); } /** @@ -742,6 +792,7 @@ public void testUserAdministrativeOverride() throws Exception { */ @Test public void testOldClientDataLoss() throws Exception { + boolean result; // Set up the initial state with all actors having entered maintenance mode setupMultiActorMaintenanceScenario(); @@ -767,7 +818,7 @@ public void testOldClientDataLoss() throws Exception { signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER), - "Controller data was lost"); + "Controller data was not lost"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER), "User data was lost"); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); @@ -776,12 +827,20 @@ public void testOldClientDataLoss() throws Exception { Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_F"); - // Exit the only remaining actor + // Verify history entry for AUTOMATION entering maintenance after data loss + verifyMaintenanceHistory("ENTER", "AUTOMATION", + "true", "reason_F"); + + // Exit the only remaining actors + _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); // Verify we're out of maintenance mode - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, + 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode after AUTOMATION exit."); } /** @@ -791,14 +850,15 @@ public void testOldClientDataLoss() throws Exception { */ @Test public void testAutomationReentryAfterDataLoss() throws Exception { + boolean result; // Setup the multi-actor scenario which ends with old client wiping data setupMultiActorMaintenanceScenario(); - // Case B Step 6: AUTOMATION enters MM again (works independently, no reconciliation) + // AUTOMATION enters MM again (works independently, no reconciliation) _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "reason_E", null); - // Verify signal now has only AUTOMATION reason (no reconciliation of wiped data) + // Verify signal now has both USER and AUTOMATION entries. MaintenanceSignal signal = _dataAccessor.getProperty(_keyBuilder.maintenance()); Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); @@ -806,12 +866,23 @@ public void testAutomationReentryAfterDataLoss() throws Exception { Assert.assertEquals(getMaintenanceReason(signal, MaintenanceSignal.TriggeringEntity.AUTOMATION), "reason_E"); - // Case B Step 7: USER exits MM -> should be administrative override since USER has no listField entry + // Verify history entry for AUTOMATION entering maintenance after data loss + verifyMaintenanceHistory("ENTER", "AUTOMATION", + "true", "reason_E"); + + + // Automation exits MM + _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, + null); + // USER exits MM _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); // Verify we're completely out of maintenance mode due to USER administrative override - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be out of maintenance mode."); + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); } /** @@ -833,7 +904,9 @@ public void testOldClientDeletesEntireZnode() throws Exception { _dataAccessor.removeProperty(_keyBuilder.maintenance()); // Verify we're completely out of maintenance mode (all data lost) - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + boolean result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, + 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode after old client deletes znode."); } /** @@ -849,6 +922,9 @@ public void testSimpleFieldsReflectMostRecent() throws Exception { Assert.assertEquals(signal.getReason(), "user_first"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + // Verify history entry for USER entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "user_first"); + Thread.sleep(10); // Entry 2: AUTOMATION at t2 (should become most recent) @@ -859,6 +935,9 @@ public void testSimpleFieldsReflectMostRecent() throws Exception { Assert.assertEquals(signal.getReason(), "automation_second"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.AUTOMATION); + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_second"); + Thread.sleep(10); // Entry 3: USER at t3 (should become most recent) @@ -869,6 +948,9 @@ public void testSimpleFieldsReflectMostRecent() throws Exception { Assert.assertEquals(signal.getReason(), "user_third"); Assert.assertEquals(signal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.USER); + // Verify history entry for USER overriding previous entry + verifyMaintenanceHistory("ENTER", "USER", "true", "user_third"); + Thread.sleep(10); // Entry 4: AUTOMATION again at t4 (should become most recent, overriding its previous entry) @@ -881,12 +963,18 @@ public void testSimpleFieldsReflectMostRecent() throws Exception { // Should still have 2 actors (AUTOMATION entry was overwritten) Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); + // Verify history entry for AUTOMATION overriding previous entry + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_fourth"); + // Clean up _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "AUTOMATION", "false", null); } /** @@ -894,6 +982,7 @@ public void testSimpleFieldsReflectMostRecent() throws Exception { */ @Test public void testEmptyStateEdgeCases() throws Exception { + boolean result; // Test 1: Try to exit when no maintenance mode exists // AUTOMATION tries to exit when no MM exists -> should be no-op @@ -929,7 +1018,11 @@ public void testEmptyStateEdgeCases() throws Exception { // USER tries to exit -> should be administrative override _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); } /** @@ -938,6 +1031,7 @@ public void testEmptyStateEdgeCases() throws Exception { */ @Test public void testMixedEntryExitScenarios() throws Exception { + boolean result; // Phase 1: Multiple actors enter _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "user_reason_1", null); @@ -949,6 +1043,9 @@ public void testMixedEntryExitScenarios() throws Exception { Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getReason(), "automation_reason_1"); // Most recent + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_reason_1"); + // Phase 2: One actor exits, then re-enters with different reason _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); // USER exits @@ -959,6 +1056,9 @@ public void testMixedEntryExitScenarios() throws Exception { Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + // Verify history entry for USER exiting maintenance (but still in maintenance due to AUTOMATION) + verifyMaintenanceHistory("EXIT", "USER", "true", null); + // USER re-enters with new reason _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "user_reason_2", null); @@ -967,12 +1067,18 @@ public void testMixedEntryExitScenarios() throws Exception { Assert.assertEquals(signal.getMaintenanceReasons().size(), 2); Assert.assertEquals(signal.getReason(), "user_reason_2"); // Most recent + // Verify history entry for USER re-entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "user_reason_2"); + // Phase 3: Clean exit _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "AUTOMATION", "false", null); } /** @@ -981,6 +1087,7 @@ public void testMixedEntryExitScenarios() throws Exception { */ @Test public void testMultiActorStackingWithController() throws Exception { + boolean result; ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); clusterConfig.setMaxPartitionsPerInstance(-1); _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); @@ -996,6 +1103,9 @@ public void testMultiActorStackingWithController() throws Exception { Assert.assertEquals(signal.getMaintenanceReasons().size(), 1); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); + // Verify history entry for CONTROLLER entering maintenance + verifyMaintenanceHistory("ENTER", "CONTROLLER", "true", "Test controller maintenance"); + // Step 2: USER enters MM - should stack with CONTROLLER _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, true, "user_reason", null); @@ -1006,6 +1116,9 @@ public void testMultiActorStackingWithController() throws Exception { Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.CONTROLLER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); + // Verify history entry for USER entering maintenance + verifyMaintenanceHistory("ENTER", "USER", "true", "user_reason"); + // Step 3: AUTOMATION enters MM - should stack with both _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, true, "automation_reason", null); @@ -1018,6 +1131,9 @@ public void testMultiActorStackingWithController() throws Exception { Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + // Verify history entry for AUTOMATION entering maintenance + verifyMaintenanceHistory("ENTER", "AUTOMATION", "true", "automation_reason"); + // Step 4: USER exits - should remain in maintenance with CONTROLLER and AUTOMATION _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); @@ -1029,6 +1145,9 @@ public void testMultiActorStackingWithController() throws Exception { Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertTrue(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + // Verify history entry for USER exiting maintenance (but still in maintenance due to others) + verifyMaintenanceHistory("EXIT", "USER", "true", null); + // Step 5: AUTOMATION exits - should remain in maintenance with only CONTROLLER _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); @@ -1040,12 +1159,18 @@ public void testMultiActorStackingWithController() throws Exception { Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.USER)); Assert.assertFalse(signal.hasMaintenanceReason(MaintenanceSignal.TriggeringEntity.AUTOMATION)); + // Verify history entry for AUTOMATION exiting maintenance (but still in maintenance due to CONTROLLER) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "true", null); + // Step 6: Exit CONTROLLER maintenance mode _gSetupTool.getClusterManagementTool().autoEnableMaintenanceMode(CLUSTER_NAME, false, null, MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); // Verify maintenance mode is completely off TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + + // Verify history entry for CONTROLLER exiting maintenance (completely out) + verifyMaintenanceHistory("EXIT", "CONTROLLER", "false", null); } /** @@ -1054,6 +1179,7 @@ public void testMultiActorStackingWithController() throws Exception { */ @Test public void testControllerNoOpAfterOldClientWipe() throws Exception { + boolean result; ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); clusterConfig.setMaxPartitionsPerInstance(-1); clusterConfig.setNumOfflineInstancesForAutoExit(-1); // Disable auto-exit to prevent race conditions @@ -1106,6 +1232,9 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); Assert.assertEquals(signal.getReason(), "Old client reason"); + // Verify history entry for CONTROLLER no-op exit (still in maintenance) + verifyMaintenanceHistory("EXIT", "CONTROLLER", "true", null); + // Try AUTOMATION exit - should also be no-op since its entry was wiped _gSetupTool.getClusterManagementTool().automationEnableMaintenanceMode(CLUSTER_NAME, false, null, null); @@ -1116,12 +1245,19 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { Assert.assertEquals(signal.getMaintenanceReasons().size(), 0); Assert.assertEquals(signal.getReason(), "Old client reason"); + // Verify history entry for AUTOMATION no-op exit (still in maintenance) + verifyMaintenanceHistory("EXIT", "AUTOMATION", "true", null); + // USER tries to exit -> should be administrative override _gSetupTool.getClusterManagementTool().manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); // Verify completely out of maintenance mode - TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + result = TestHelper.verify(() -> _dataAccessor.getProperty(_keyBuilder.maintenance()) == null, 2000L); + Assert.assertTrue(result, "Should be completely out of maintenance mode due to USER administrative override."); + + // Verify in history that we're no longer in maintenance + verifyMaintenanceHistory("EXIT", "USER", "false", null); } /** @@ -1129,6 +1265,7 @@ public void testControllerNoOpAfterOldClientWipe() throws Exception { */ @Test public void testControllerOverrideBehavior() throws Exception { + boolean result; ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); clusterConfig.setMaxPartitionsPerInstance(-1); _manager.getConfigAccessor().setClusterConfig(CLUSTER_NAME, clusterConfig); @@ -1181,6 +1318,7 @@ public void testControllerOverrideBehavior() throws Exception { */ @Test public void testReconciliationOfLegacyUserData() throws Exception { + boolean result; // Step 1: Old client sets simpleFields only (no listFields) ZNRecord record = new ZNRecord("maintenance"); record.setSimpleField(PauseSignal.PauseSignalProperty.REASON.name(), "legacy_user_reason");