Skip to content

Commit

Permalink
[PLAT-12910] Shutdown YBA on demotion
Browse files Browse the repository at this point in the history
Summary: If a standby YBA instance is promoted while the active instance is running a task, the task still runs to completion on the previously active instance even though it is marked "failed" from the promoted instance's perspective. To solve this, we also restart YBA during demotion so that any in-progress tasks are aborted.

Test Plan: Promote a standby during a Gflag upgrade and ensure the task is marked as failed on both instances.

Reviewers: nsingh, dshubin, sanketh

Reviewed By: nsingh

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D32911
  • Loading branch information
mchiddy committed Mar 13, 2024
1 parent c5b66cb commit 51e6595
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 25 deletions.
1 change: 1 addition & 0 deletions managed/RUNTIME-FLAGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
| "XCluster isBootstrapRequired rpc max parallel threads" | "yb.xcluster.is_bootstrap_required_rpc_pool.max_threads" | "GLOBAL" | "Sets the maximum allowed number of threads to be run concurrently for xcluster isBootstrapRequired rpc" | "Integer" |
| "YBC poll upgrade result tries" | "ybc.upgrade.poll_result_tries" | "GLOBAL" | "YBC poll upgrade result tries count." | "Integer" |
| "YBC poll upgrade result Sleep time" | "ybc.upgrade.poll_result_sleep_ms" | "GLOBAL" | "YBC poll upgrade result sleep time." | "Long" |
| "HA Shutdown Level" | "yb.ha.shutdown_level" | "GLOBAL" | "When to shutdown - 0 for never, 1 for promotion, 2 for promotion and demotion" | "Integer" |
| "Clock Skew" | "yb.alert.max_clock_skew_ms" | "UNIVERSE" | "Default threshold for Clock Skew alert" | "Duration" |
| "Health Log Output" | "yb.health.logOutput" | "UNIVERSE" | "It determines whether to log the output of the node health check script to the console" | "Boolean" |
| "Node Checkout Time" | "yb.health.nodeCheckTimeoutSec" | "UNIVERSE" | "The timeout (in seconds) for node check operation as part of universe health check" | "Integer" |
Expand Down
29 changes: 29 additions & 0 deletions managed/src/main/java/com/yugabyte/yw/common/Util.java
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,35 @@ public static String escapeSingleQuotesOnly(String src) {
return src.replaceAll("'", "''");
}

/**
 * Schedules a delayed shutdown of the YBA process and returns immediately.
 *
 * <p>Two background threads are started. The first sleeps for the requested delay and then calls
 * {@link System#exit(int)}. The second waits for the first for the delay plus 30 seconds and
 * then calls {@link Runtime#halt(int)}, so the JVM still terminates if the orderly exit hangs.
 * Note that the watcher also halts the JVM as soon as the first thread terminates without
 * exiting (for example because it was interrupted).
 *
 * @param seconds delay, in seconds, before the orderly exit is attempted; negative values are
 *     treated as zero
 */
public static void shutdownYbaProcess(int seconds) {
  // Widen to long before multiplying so a large delay cannot overflow int arithmetic, and clamp
  // negatives because Thread.sleep/Thread.join reject negative timeouts.
  final long delayMs = Math.max(0, seconds) * 1000L;
  final long haltGraceMs = 30000L; /* add 30 seconds */
  // Background thread to exit YBA process
  Thread shutdownThread =
      new Thread(
          () -> {
            try {
              Thread.sleep(delayMs);
              LOG.info("Shutting down via system exit.");
              System.exit(0);
            } catch (InterruptedException e) {
              LOG.warn("Interrupted during system exit.");
              // Preserve the interrupt status for anything inspecting this thread.
              Thread.currentThread().interrupt();
            }
          },
          "yba-shutdown-exit");
  // Watcher thread to forcibly halt JVM if exit hangs
  Thread haltThread =
      new Thread(
          () -> {
            try {
              // The timeout is always >= 30s, so this never degenerates into join(0),
              // which would wait forever.
              shutdownThread.join(delayMs + haltGraceMs);
              LOG.info("Shutting down via halt.");
              Runtime.getRuntime().halt(0);
            } catch (InterruptedException e) {
              LOG.warn("Interrupted during wait for exit.");
              // Preserve the interrupt status for anything inspecting this thread.
              Thread.currentThread().interrupt();
            }
          },
          "yba-shutdown-halt");
  shutdownThread.start();
  haltThread.start();
}

@VisibleForTesting
public static String removeEnclosingDoubleQuotes(String src) {
if (src != null && src.startsWith("\"") && src.endsWith("\"")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1312,4 +1312,12 @@ public class GlobalConfKeys extends RuntimeConfigKeysModule {
"Enable Azure Provider quick validation",
ConfDataType.BooleanType,
ImmutableList.of(ConfKeyTags.INTERNAL));
/**
 * Global runtime flag selecting which HA role changes shut down the YBA process: 0 never, 1 on
 * promotion only, 2 on both promotion and demotion.
 */
public static final ConfKeyInfo<Integer> haShutdownLevel =
new ConfKeyInfo<>(
"yb.ha.shutdown_level",
ScopeType.GLOBAL,
"HA Shutdown Level",
"When to shutdown - 0 for never, 1 for promotion, 2 for promotion and demotion",
ConfDataType.IntegerType,
ImmutableList.of(ConfKeyTags.PUBLIC));
}
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,13 @@ public void syncInstances(long timestamp, JsonNode payload) {
}

/**
* calls {@link com.yugabyte.yw.controllers.InternalHAController#demoteLocalLeader(long
* timestamp)} on remote platform instance
* calls {@link com.yugabyte.yw.controllers.InternalHAController#demoteLocalLeader(long timestamp,
* boolean promote)} on remote platform instance
*/
public void demoteInstance(String localAddr, long timestamp) {
public void demoteInstance(String localAddr, long timestamp, boolean promote) {
ObjectNode formData = Json.newObject().put("leader_address", localAddr);
final JsonNode response =
this.makeRequest(this.controller.demoteLocalLeader(timestamp), formData);
this.makeRequest(this.controller.demoteLocalLeader(timestamp, promote), formData);
maybeGenerateVersionMismatchEvent(response.get("ybaVersion"));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ private void writeFederatedPrometheusConfig(String remoteAddr, File file, boolea
}
}

boolean demoteRemoteInstance(PlatformInstance remoteInstance) {
boolean demoteRemoteInstance(PlatformInstance remoteInstance, boolean promote) {
try {
if (remoteInstance.getIsLocal()) {
LOG.warn("Cannot perform demoteRemoteInstance action on a local instance");
Expand All @@ -261,7 +261,7 @@ boolean demoteRemoteInstance(PlatformInstance remoteInstance) {
localInstance -> {
// Send step down request to remote instance.
client.demoteInstance(
localInstance.getAddress(), config.getLastFailover().getTime());
localInstance.getAddress(), config.getLastFailover().getTime(), promote);

return true;
})
Expand Down Expand Up @@ -451,7 +451,7 @@ boolean syncToRemoteInstance(PlatformInstance remoteInstance) {
LOG.debug("Syncing data to " + remoteAddr + "...");

// Ensure that the remote instance is demoted if this instance is the most current leader.
if (!this.demoteRemoteInstance(remoteInstance)) {
if (!this.demoteRemoteInstance(remoteInstance, false)) {
LOG.error("Error demoting remote instance " + remoteAddr);
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,12 @@ public void promoteLocalInstance(PlatformInstance newLeader) {
config.updateLastFailover();
// Attempt to ensure all remote instances are in follower state.
// Remotely demote any instance reporting to be a leader.
config.getRemoteInstances().forEach(replicationHelper::demoteRemoteInstance);
config
.getRemoteInstances()
.forEach(
instance -> {
replicationHelper.demoteRemoteInstance(instance, true);
});
// Promote the new local leader.
// we need to refresh because i.setIsLocalAndUpdate updated the underlying db bypassing
// newLeader bean.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import com.yugabyte.yw.common.ConfigHelper.ConfigType;
import com.yugabyte.yw.common.Util;
import com.yugabyte.yw.common.ValidatingFormFactory;
import com.yugabyte.yw.common.config.GlobalConfKeys;
import com.yugabyte.yw.common.config.RuntimeConfGetter;
import com.yugabyte.yw.common.ha.PlatformReplicationManager;
import com.yugabyte.yw.forms.DemoteInstanceFormData;
import com.yugabyte.yw.forms.PlatformResults;
Expand Down Expand Up @@ -43,6 +45,8 @@ public class InternalHAController extends Controller {

public static final Logger LOG = LoggerFactory.getLogger(InternalHAController.class);

@Inject private RuntimeConfGetter runtimeConfGetter;

private final PlatformReplicationManager replicationManager;
private final ValidatingFormFactory formFactory;
private final ConfigHelper configHelper;
Expand Down Expand Up @@ -189,7 +193,7 @@ public Result syncBackups(Http.Request request) throws Exception {
}
}

public Result demoteLocalLeader(long timestamp, Http.Request request) {
public Result demoteLocalLeader(long timestamp, boolean promote, Http.Request request) {
try {
Optional<HighAvailabilityConfig> config =
HighAvailabilityConfig.getByClusterKey(this.getClusterKey(request));
Expand Down Expand Up @@ -234,6 +238,11 @@ public Result demoteLocalLeader(long timestamp, Http.Request request) {

localInstance.get().setYbaVersion(version);

// Only restart YBA when demote comes from promote call, not from periodic sync
if (promote && runtimeConfGetter.getGlobalConf(GlobalConfKeys.haShutdownLevel) > 1) {
Util.shutdownYbaProcess(5);
}

return PlatformResults.withData(localInstance);
} catch (Exception e) {
LOG.error("Error demoting platform instance", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import com.yugabyte.yw.common.CustomerTaskManager;
import com.yugabyte.yw.common.PlatformServiceException;
import com.yugabyte.yw.common.Util;
import com.yugabyte.yw.common.config.GlobalConfKeys;
import com.yugabyte.yw.common.config.RuntimeConfGetter;
import com.yugabyte.yw.common.ha.PlatformReplicationManager;
import com.yugabyte.yw.common.rbac.PermissionInfo.Action;
import com.yugabyte.yw.common.rbac.PermissionInfo.ResourceType;
Expand Down Expand Up @@ -46,6 +48,8 @@ public class PlatformInstanceController extends AuthenticatedController {

@Inject private PlatformReplicationManager replicationManager;

@Inject private RuntimeConfGetter runtimeConfGetter;

@Inject CustomerTaskManager taskManager;

@AuthzPath({
Expand Down Expand Up @@ -256,18 +260,9 @@ public Result promoteInstance(
instanceUUID.toString(),
Audit.ActionType.Promote);

// Background thread to restart YBA
Thread shutdownThread =
new Thread(
() -> {
try {
Thread.sleep(5000);
System.exit(0);
} catch (InterruptedException e) {
LOG.warn("Interrupted during restart.");
}
});
shutdownThread.start();
if (runtimeConfGetter.getGlobalConf(GlobalConfKeys.haShutdownLevel) > 0) {
Util.shutdownYbaProcess(5);
}
return ok();
}
}
10 changes: 7 additions & 3 deletions managed/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -651,11 +651,15 @@ yb {
}
}
ha {
logScriptOutput = false
num_backup_retention = 10
prometheus_config_dir = "/prometheus_configs"
replication_schedule_enabled = false
replication_frequency = 30 minutes
prometheus_config_dir = "/prometheus_configs"
num_backup_retention = 10
logScriptOutput = false
# 0 - never shutdown
# 1 - only shutdown promoted instance
# 2 - shutdown both promoted and demoted instance
shutdown_level = 2
test_request_timeout = 10 seconds
test_connection_timeout = 10 seconds
ws = ${play.ws}
Expand Down
2 changes: 1 addition & 1 deletion managed/src/main/resources/v1.routes
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ GET /settings/ha/config/:cUUID/instance/local c

# HA API for inter-node communication
GET /settings/ha/internal/config com.yugabyte.yw.controllers.InternalHAController.getHAConfigByClusterKey(request: Request)
PUT /settings/ha/internal/config/demote/:timestamp com.yugabyte.yw.controllers.InternalHAController.demoteLocalLeader(timestamp: Long, request: Request)
PUT /settings/ha/internal/config/demote/:timestamp com.yugabyte.yw.controllers.InternalHAController.demoteLocalLeader(timestamp: Long, promote: java.lang.Boolean ?= false, request: Request)
PUT /settings/ha/internal/config/sync/:timestamp com.yugabyte.yw.controllers.InternalHAController.syncInstances(timestamp: Long, request: Request)
POST /settings/ha/internal/upload com.yugabyte.yw.controllers.InternalHAController.syncBackups(request: Request)

Expand Down

0 comments on commit 51e6595

Please sign in to comment.