-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[PLAT-14893] Add a new node action decommission node
Summary: Added a new node action, Decommission, which is equivalent to running the Remove + Release + Delete node actions in sequence. The restrictions on Decommission are the same as for the current Remove/Release actions: 1. Either other nodes must exist in this AZ, or no tablets may be assigned to the node being decommissioned. 2. A final check verifies that no tablets are assigned to this node before releasing the VM / deleting on-prem data. 3. The operation cannot take the universe below floor(RF/2) masters. Deleting an entire AZ with this operation is allowed, as with the current remove -> release -> delete sequence; this will be addressed separately based on discussion. Test Plan: 1. Local provider test removing 1 node from a 3-node RF1 universe. 2. Local provider test removing 1 node with a master from a (2,1,1) RF3 cluster. 3. Local provider test removing 1 node from an RR cluster. 4. Tested on 4- and 6-node RF3 AWS/GCP universes, on nodes with and without a master. 5. Verified that the action does not show up in the UI and that the API call fails for a 3-node RF3 universe. Reviewers: cwang, nsingh, yshchetinin Reviewed By: cwang Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D37885
- Loading branch information
Showing
27 changed files
with
641 additions
and
309 deletions.
There are no files selected for viewing
171 changes: 171 additions & 0 deletions
171
managed/src/main/java/com/yugabyte/yw/commissioner/tasks/DecommissionNode.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
/* | ||
* Copyright 2024 YugaByte, Inc. and Contributors | ||
* | ||
* Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you | ||
* may not use this file except in compliance with the License. You | ||
* may obtain a copy of the License at | ||
* | ||
* https://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt | ||
*/ | ||
|
||
package com.yugabyte.yw.commissioner.tasks; | ||
|
||
import com.yugabyte.yw.commissioner.BaseTaskDependencies; | ||
import com.yugabyte.yw.commissioner.ITask.Retryable; | ||
import com.yugabyte.yw.commissioner.UserTaskDetails.SubTaskGroupType; | ||
import com.yugabyte.yw.commissioner.tasks.params.NodeTaskParams; | ||
import com.yugabyte.yw.common.NodeActionType; | ||
import com.yugabyte.yw.common.config.UniverseConfKeys; | ||
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.Cluster; | ||
import com.yugabyte.yw.models.Universe; | ||
import com.yugabyte.yw.models.helpers.NodeDetails; | ||
import java.util.Set; | ||
import javax.inject.Inject; | ||
import lombok.extern.slf4j.Slf4j; | ||
|
||
@Slf4j | ||
@Retryable | ||
public class DecommissionNode extends EditUniverseTaskBase { | ||
|
||
@Inject | ||
protected DecommissionNode(BaseTaskDependencies baseTaskDependencies) { | ||
super(baseTaskDependencies); | ||
} | ||
|
||
@Override | ||
protected NodeTaskParams taskParams() { | ||
return (NodeTaskParams) taskParams; | ||
} | ||
|
||
private void runBasicChecks(Universe universe) { | ||
NodeDetails currentNode = universe.getNode(taskParams().nodeName); | ||
if (isFirstTry()) { | ||
currentNode.validateActionOnState(NodeActionType.DECOMMISSION); | ||
} | ||
} | ||
|
||
@Override | ||
public void validateParams(boolean isFirstTry) { | ||
super.validateParams(isFirstTry); | ||
runBasicChecks(getUniverse()); | ||
} | ||
|
||
// Check that there is a place to move the tablets and if not, make sure there are no tablets | ||
// assigned to this tserver. Otherwise, do not allow the remove node task to succeed. | ||
public void performPrecheck() { | ||
Universe universe = getUniverse(); | ||
NodeDetails currentNode = universe.getNode(taskParams().nodeName); | ||
|
||
if (!isTabletMovementAvailable(taskParams().nodeName)) { | ||
log.debug( | ||
"Tablets have nowhere to move off of tserver on node: {}. Checking if there are still" | ||
+ " tablets assigned to it. A healthy tserver should not be removed.", | ||
currentNode.getNodeName()); | ||
// TODO: Move this into a subtask. | ||
checkNoTabletsOnNode(universe, currentNode); | ||
} | ||
log.debug("Pre-check succeeded"); | ||
} | ||
|
||
@Override | ||
protected void createPrecheckTasks(Universe universe) { | ||
|
||
NodeDetails currentNode = universe.getNode(taskParams().nodeName); | ||
if (currentNode == null) { | ||
if (isFirstTry()) { | ||
String msg = | ||
"No node " + taskParams().nodeName + " found in universe " + universe.getName(); | ||
log.error(msg); | ||
throw new RuntimeException(msg); | ||
} else { | ||
// We might be here on a retry that actually deleted the node | ||
// don't do anything in this case | ||
return; | ||
} | ||
} | ||
|
||
if (isFirstTry()) { | ||
setToBeRemovedState(currentNode); | ||
configureTaskParams(universe); | ||
} | ||
|
||
// Check again after locking. | ||
runBasicChecks(getUniverse()); | ||
boolean alwaysWaitForDataMove = | ||
confGetter.getConfForScope(getUniverse(), UniverseConfKeys.alwaysWaitForDataMove); | ||
if (alwaysWaitForDataMove) { | ||
performPrecheck(); | ||
} | ||
addBasicPrecheckTasks(); | ||
} | ||
|
||
@Override | ||
public void run() { | ||
log.info( | ||
"Started {} task for node {} in univ uuid={}", | ||
getName(), | ||
taskParams().nodeName, | ||
taskParams().getUniverseUUID()); | ||
checkUniverseVersion(); | ||
|
||
Universe universe = getUniverse(); | ||
if (universe.getNode(taskParams().nodeName) == null) { | ||
log.info("No node found with name {}", taskParams().nodeName); | ||
if (isFirstTry()) { | ||
throw new RuntimeException( | ||
String.format("Node %s appears to have already been deleted", taskParams().nodeName)); | ||
} else { | ||
log.info("Completing task because no node {} found", taskParams().nodeName); | ||
} | ||
return; | ||
} | ||
|
||
universe = | ||
lockAndFreezeUniverseForUpdate( | ||
taskParams().expectedUniverseVersion, this::freezeUniverseInTxn); | ||
try { | ||
preTaskActions(); | ||
|
||
Cluster taskParamsCluster = taskParams().getClusterByNodeName(taskParams().nodeName); | ||
NodeDetails currentNode = universe.getNode(taskParams().nodeName); | ||
taskParams().azUuid = currentNode.azUuid; | ||
taskParams().placementUuid = currentNode.placementUuid; | ||
|
||
Set<NodeDetails> addedMasters = getAddedMasters(); | ||
Set<NodeDetails> removedMasters = getRemovedMasters(); | ||
|
||
// Update the cluster in memory. | ||
universe | ||
.getUniverseDetails() | ||
.upsertCluster( | ||
taskParamsCluster.userIntent, | ||
taskParamsCluster.placementInfo, | ||
taskParamsCluster.uuid); | ||
|
||
log.info("Decommission: added masters {}, removed masters {}", addedMasters, removedMasters); | ||
|
||
editCluster( | ||
universe, | ||
taskParams().clusters, | ||
taskParamsCluster, | ||
getNodesInCluster(taskParamsCluster.uuid, addedMasters), | ||
getNodesInCluster(taskParamsCluster.uuid, removedMasters), | ||
!addedMasters.isEmpty() || !removedMasters.isEmpty() /*updateMasters*/, | ||
true /* force */); | ||
|
||
createUpdateUniverseIntentTask(taskParamsCluster, true /*updatePlacementInfo*/); | ||
|
||
// Mark universe task state to success | ||
createMarkUniverseUpdateSuccessTasks().setSubTaskGroupType(SubTaskGroupType.RemovingNode); | ||
|
||
// Run all the tasks. | ||
getRunnableTask().runSubTasks(); | ||
} catch (Throwable t) { | ||
log.error("Error executing task {} with error='{}'.", getName(), t.getMessage(), t); | ||
throw t; | ||
} finally { | ||
unlockUniverseForUpdate(); | ||
} | ||
log.info("Finished {} task.", getName()); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.