-
Notifications
You must be signed in to change notification settings - Fork 241
Maintenance mode stacking support #3044
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
LZD-PratyushBhatt
wants to merge
10
commits into
apache:master
Choose a base branch
from
LZD-PratyushBhatt:maintenance_mode_stacking
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
b6d903a
Starter code for MM stacking
LZD-PratyushBhatt cdac22a
Reconciling properly during exit and entry
LZD-PratyushBhatt 42ba048
MM cleanup fix
LZD-PratyushBhatt b4f97ea
Add more reconciliation cases
LZD-PratyushBhatt aa0c54d
Remove unnecessary depends
LZD-PratyushBhatt 68a6476
Fix the switch case
LZD-PratyushBhatt b6a081b
Remove unnecessary setting of simple fields
LZD-PratyushBhatt 36830c7
Revamp the code as per the final design
LZD-PratyushBhatt 4f829df
Address review comments
LZD-PratyushBhatt b7ed069
Make tests more robust
LZD-PratyushBhatt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -948,8 +948,18 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) { | |
| public boolean isInMaintenanceMode(String clusterName) { | ||
| HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); | ||
| PropertyKey.Builder keyBuilder = accessor.keyBuilder(); | ||
| return accessor.getBaseDataAccessor() | ||
| .exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT); | ||
|
|
||
| MaintenanceSignal signal = accessor.getProperty(keyBuilder.maintenance()); | ||
|
|
||
| if (signal == null) { | ||
| return false; | ||
| } | ||
|
|
||
| // The cluster is in maintenance mode if the maintenance signal ZNode exists | ||
| // This includes cases where old clients have wiped listField data but simpleFields remain | ||
| // cluster should remain in maintenance mode as long as ZNode exists | ||
| return signal.hasMaintenanceReasons() || | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we remove the check for the empty string, as this may break backward compatibility? |
||
| (signal.getReason() != null && !signal.getReason().isEmpty()); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -1182,6 +1192,14 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S | |
| MaintenanceSignal.TriggeringEntity.USER); | ||
| } | ||
|
|
||
| @Override | ||
| public void automationEnableMaintenanceMode(String clusterName, boolean enabled, String reason, | ||
| Map<String, String> customFields) { | ||
| processMaintenanceMode(clusterName, enabled, reason, | ||
| MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE, customFields, | ||
| MaintenanceSignal.TriggeringEntity.AUTOMATION); | ||
| } | ||
|
|
||
| /** | ||
| * Helper method for enabling/disabling maintenance mode. | ||
| * @param clusterName | ||
|
|
@@ -1201,23 +1219,74 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, | |
| triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically" | ||
| : "manually", enabled ? "enters" : "exits", reason == null ? "NULL" : reason); | ||
| final long currentTime = System.currentTimeMillis(); | ||
|
|
||
| MaintenanceSignal maintenanceSignal = accessor.getProperty(keyBuilder.maintenance()); | ||
| if (!enabled) { | ||
| // Exit maintenance mode | ||
| accessor.removeProperty(keyBuilder.maintenance()); | ||
| // Exit maintenance mode for this specific triggering entity | ||
|
|
||
| // Early return if no maintenance signal exists | ||
| if (maintenanceSignal == null) { | ||
| if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { | ||
| logger.info("USER administrative override: no maintenance signal exists, nothing to remove"); | ||
| } else { | ||
| // CONTROLLER/AUTOMATION: strict no-op | ||
| logger.info("Entity {} attempted to exit maintenance mode but no maintenance signal exists", triggeringEntity); | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| // If a specific actor is exiting maintenance mode | ||
| boolean removed = maintenanceSignal.removeMaintenanceReason(triggeringEntity); | ||
|
|
||
| if (removed) { | ||
| // If there are still reasons for maintenance mode, update the ZNode | ||
|
|
||
| if (maintenanceSignal.hasMaintenanceReasons()) { | ||
| if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { | ||
| throw new HelixException("Failed to update maintenance signal!"); | ||
| } | ||
| } else { | ||
| // If this was the last reason, remove the maintenance ZNode entirely | ||
| accessor.removeProperty(keyBuilder.maintenance()); | ||
| } | ||
| } else { | ||
| // Case where triggering entity doesn't have an entry | ||
| // Note: CONTROLLER/AUTOMATION is strict no-op, USER can do administrative override | ||
| if (triggeringEntity == MaintenanceSignal.TriggeringEntity.USER) { | ||
| // USER has special privilege to force exit maintenance mode as administrative override | ||
| logger.info("USER administrative override: forcefully exiting maintenance mode for cluster {}", clusterName); | ||
| accessor.removeProperty(keyBuilder.maintenance()); | ||
| } else { | ||
| // CONTROLLER/AUTOMATION: strict no-op if their entry not found | ||
| logger.info("Entity {} doesn't have a maintenance reason entry, exit request ignored", triggeringEntity); | ||
| } | ||
| } | ||
| } else { | ||
| // Enter maintenance mode | ||
| MaintenanceSignal maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); | ||
| if (maintenanceSignal == null) { | ||
| // Create a new maintenance signal if it doesn't exist | ||
| maintenanceSignal = new MaintenanceSignal(MAINTENANCE_ZNODE_ID); | ||
| } | ||
|
|
||
| // This is CRITICAL: Reconcile any legacy data BEFORE updating simpleFields | ||
| // This must happen before any simpleField updates to preserve legacy USER data | ||
| maintenanceSignal.reconcileLegacyData(); | ||
|
|
||
| // Add the reason to the maintenance signal | ||
| if (reason != null) { | ||
| maintenanceSignal.setReason(reason); | ||
| } | ||
|
|
||
| maintenanceSignal.setTimestamp(currentTime); | ||
| maintenanceSignal.setTriggeringEntity(triggeringEntity); | ||
|
|
||
| switch (triggeringEntity) { | ||
| case CONTROLLER: | ||
| // autoEnable | ||
| maintenanceSignal.setAutoTriggerReason(internalReason); | ||
| break; | ||
| case USER: | ||
| case AUTOMATION: | ||
| case UNKNOWN: | ||
| // manuallyEnable | ||
| if (customFields != null && !customFields.isEmpty()) { | ||
|
|
@@ -1231,8 +1300,18 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, | |
| } | ||
| break; | ||
| } | ||
| if (!accessor.createMaintenance(maintenanceSignal)) { | ||
| throw new HelixException("Failed to create maintenance signal!"); | ||
|
|
||
| // Add this reason to the multi-actor maintenance reasons list | ||
| maintenanceSignal.addMaintenanceReason(reason, currentTime, triggeringEntity); | ||
|
|
||
| if (accessor.getProperty(keyBuilder.maintenance()) == null) { | ||
| if (!accessor.createMaintenance(maintenanceSignal)) { | ||
| throw new HelixException("Failed to create maintenance signal!"); | ||
| } | ||
| } else { | ||
| if (!accessor.setProperty(keyBuilder.maintenance(), maintenanceSignal)) { | ||
| throw new HelixException("Failed to update maintenance signal!"); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1246,7 +1325,8 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, | |
| } | ||
| return new ControllerHistory(oldRecord) | ||
| .updateMaintenanceHistory(enabled, reason, currentTime, internalReason, | ||
| customFields, triggeringEntity); | ||
| customFields, triggeringEntity, | ||
| isInMaintenanceMode(clusterName)); | ||
| } catch (IOException e) { | ||
| logger.error("Failed to update maintenance history! Exception: {}", e); | ||
| return oldRecord; | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are already 3 methods with similar names:
enableMM, autoEnableMM, manuallyEnableMM, and now automationEnableMaintenanceMode. I have multiple queries here:
autoEnableMM, enableMM with different triggering entities. Should we create an issue in Apache Helix as a TODO for this?