Skip to content
10 changes: 10 additions & 0 deletions helix-core/src/main/java/org/apache/helix/HelixAdmin.java
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,16 @@ void autoEnableMaintenanceMode(String clusterName, boolean enabled, String reaso
void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
Map<String, String> customFields);

/**
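* Enter or exit maintenance mode on behalf of an automation system (like HelixACM). To be called by automation services.
* @param clusterName the cluster name
* @param enabled true to enter maintenance mode, false to exit it
* @param reason the reason for this maintenance mode change
* @param customFields user-specified KV mappings to be stored in the ZNode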
*/
void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are already 3 methods with similar names (enableMM, autoEnableMM, manuallyEnableMM), and now automationEnableMaintenanceMode.
I have a couple of questions here:

  1. Is there a reason why we are not overloading the existing name autoEnableMM for the new method?
  2. IMO, there should be only one method, enableMM, with different triggering entities. Should we create an issue in Apache Helix as a TODO for this?

Map<String, String> customFields);

/**
* Check specific cluster is in maintenance mode or not
* @param clusterName the cluster name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -948,8 +948,18 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) {
public boolean isInMaintenanceMode(String clusterName) {
HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor);
PropertyKey.Builder keyBuilder = accessor.keyBuilder();
return accessor.getBaseDataAccessor()
.exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT);

MaintenanceSignal signal = accessor.getProperty(keyBuilder.maintenance());

if (signal == null) {
return false;
}

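// The cluster is reported as in maintenance mode when the maintenance signal ZNode exists AND
// either signal.hasMaintenanceReasons() is true or the simpleField reason is non-empty.
// The second condition covers cases where old clients have wiped listField data but simpleFields remain.
// NOTE(review): a signal ZNode with no reasons and a null/empty reason is reported as NOT in
// maintenance mode, unlike the previous pure existence check - confirm this is intended.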
return signal.hasMaintenanceReasons() ||

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we remove the check for the empty string, as this may break backward compatibility?

(signal.getReason() != null && !signal.getReason().isEmpty());
}

@Override
Expand Down Expand Up @@ -1182,6 +1192,14 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S
MaintenanceSignal.TriggeringEntity.USER);
}

/**
 * Enters or exits maintenance mode on behalf of an automation service.
 * Delegates to processMaintenanceMode, attributing the request to
 * MaintenanceSignal.TriggeringEntity.AUTOMATION.
 * @param clusterName the cluster name
 * @param enabled true to enter maintenance mode, false to exit it
 * @param reason the reason for this maintenance mode change
 * @param customFields user-specified KV mappings passed through to processMaintenanceMode
 */
@Override
public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
Map<String, String> customFields) {
// No controller auto-trigger reason applies to automation-initiated requests, so
// NOT_APPLICABLE is passed as the internal reason.
processMaintenanceMode(clusterName, enabled, reason,
MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE, customFields,
MaintenanceSignal.TriggeringEntity.AUTOMATION);
}

/**
* Helper method for enabling/disabling maintenance mode.
* @param clusterName
Expand All @@ -1201,23 +1219,74 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically"
: "manually", enabled ? "enters" : "exits", reason == null ? "NULL" : reason);
final long currentTime = System.currentTimeMillis();

MaintenanceSignal maintenanceSignal = accessor.getProperty(keyBuilder.maintenance());
if (!enabled) {
// Exit maintenance mode
accessor.removeProperty(keyBuilder.maintenance());
// Exit maintenance mode for this specific triggering entity

// Early return if no maintenance signal exists
if (maintenanceSignal == null) {
if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) {
logger.info("USER administrative override: no maintenance signal exists, nothing to remove");
} else {
// CONTROLLER/AUTOMATION: strict no-op
logger.info("Entity {} attempted to exit maintenance mode but no maintenance signal exists", triggeringEntity);
}
return;
}

// If a specific actor is exiting maintenance mode
boolean removed = maintenanceSignal.removeMaintenanceReason(triggeringEntity);

if (removed) {
// If there are still reasons for maintenance mode, update the ZNode

if (maintenanceSignal.hasMaintenanceReasons()) {
if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) {
throw new HelixException("Failed to update maintenance signal!");
}
} else {
// If this was the last reason, remove the maintenance ZNode entirely
accessor.removeProperty(keyBuilder.maintenance());
}
} else {
// Case where triggering entity doesn't have an entry
// Note: CONTROLLER/AUTOMATION is strict no-op, USER can do administrative override
if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) {
// USER has special privilege to force exit maintenance mode as administrative override
logger.info("USER administrative override: forcefully exiting maintenance mode for cluster {}", clusterName);
accessor.removeProperty(keyBuilder.maintenance());
} else {
// CONTROLLER/AUTOMATION: strict no-op if their entry not found
logger.info("Entity {} doesn't have a maintenance reason entry, exit request ignored", triggeringEntity);
}
}
} else {
// Enter maintenance mode
MaintenanceSignal maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID);
if (maintenanceSignal == null) {
// Create a new maintenance signal if it doesn't exist
maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID);
}

// This is CRITICAL: Reconcile any legacy data BEFORE updating simpleFields
// This must happen before any simpleField updates to preserve legacy USER data
maintenanceSignal.reconcileLegacyData();

// Add the reason to the maintenance signal
if (reason != null) {
maintenanceSignal.setReason(reason);
}

maintenanceSignal.setTimestamp(currentTime);
maintenanceSignal.setTriggeringEntity(triggeringEntity);

switch (triggeringEntity) {
case CONTROLLER:
// autoEnable
maintenanceSignal.setAutoTriggerReason(internalReason);
break;
case USER:
case AUTOMATION:
case UNKNOWN:
// manuallyEnable
if (customFields != null && !customFields.isEmpty()) {
Expand All @@ -1231,8 +1300,18 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
}
break;
}
if (!accessor.createMaintenance(maintenanceSignal)) {
throw new HelixException("Failed to create maintenance signal!");

// Add this reason to the multi-actor maintenance reasons list
maintenanceSignal.addMaintenanceReason(reason, currentTime, triggeringEntity);

if (accessor.getProperty(keyBuilder.maintenance()) == null) {
if (!accessor.createMaintenance(maintenanceSignal)) {
throw new HelixException("Failed to create maintenance signal!");
}
} else {
if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) {
throw new HelixException("Failed to update maintenance signal!");
}
}
}

Expand All @@ -1246,7 +1325,8 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
}
return new ControllerHistory(oldRecord)
.updateMaintenanceHistory(enabled, reason, currentTime, internalReason,
customFields, triggeringEntity);
customFields, triggeringEntity,
isInMaintenanceMode(clusterName));
} catch (IOException e) {
logger.error("Failed to update maintenance history! Exception: {}", e);
return oldRecord;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ private enum MaintenanceConfigKey {
MAINTENANCE_HISTORY,
OPERATION_TYPE,
DATE,
REASON

REASON,
IN_MAINTENANCE_AFTER_OPERATION
}

private enum ManagementModeConfigKey {
Expand Down Expand Up @@ -180,10 +180,11 @@ public ZNRecord updateManagementModeHistory(String controller, ClusterManagement
* @param internalReason
* @param customFields
* @param triggeringEntity
* @param inMaintenanceAfterOperation whether the cluster is still in maintenance mode after this operation
*/
public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long currentTime,
MaintenanceSignal.AutoTriggerReason internalReason, Map<String, String> customFields,
MaintenanceSignal.TriggeringEntity triggeringEntity) throws IOException {
MaintenanceSignal.TriggeringEntity triggeringEntity, boolean inMaintenanceAfterOperation) throws IOException {
DateFormat df = new SimpleDateFormat("yyyy-MM-dd-HH:" + "mm:ss");
df.setTimeZone(TimeZone.getTimeZone("UTC"));
String dateTime = df.format(new Date(currentTime));
Expand All @@ -198,6 +199,8 @@ public ZNRecord updateMaintenanceHistory(boolean enabled, String reason, long cu
String.valueOf(currentTime));
maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.TRIGGERED_BY.name(),
triggeringEntity.name());
maintenanceEntry.put(MaintenanceConfigKey.IN_MAINTENANCE_AFTER_OPERATION.name(),
String.valueOf(inMaintenanceAfterOperation));
if (triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER) {
// If auto-triggered
maintenanceEntry.put(MaintenanceSignal.MaintenanceSignalProperty.AUTO_TRIGGER_REASON.name(),
Expand Down
Loading