Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,22 @@ Map<String, Pair<Boolean, String>> getSafeModeRuleStatuses()
*/
boolean forceExitSafeMode() throws IOException;

/**
 * Check if a specific SCM node is in safe mode.
 *
 * @param nodeId SCM node ID to query
 * @return true if the node is in safe mode, false otherwise
 * @throws IOException if the safe mode status cannot be obtained for the
 *         given node
 */
boolean inSafeModeForNode(String nodeId) throws IOException;

/**
 * Get safe mode rule statuses from a specific SCM node.
 *
 * @param nodeId SCM node ID to query
 * @return Map of rule name to rule status; in each pair the left value is
 *         the validation flag of the rule and the right value is its status
 *         text
 * @throws IOException if the rule statuses cannot be obtained for the
 *         given node
 */
Map<String, Pair<Boolean, String>> getSafeModeRuleStatusesForNode(String nodeId) throws IOException;

/**
* Start ReplicationManager.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ public interface StorageContainerLocationProtocol extends Closeable {
Type.StopReplicationManager,
Type.ForceExitSafeMode));

/**
 * Read-only commands that can execute on followers without leader check.
 * These commands respect the --scm parameter and query the specified SCM.
 *
 * <p>The server-side request handler skips its not-leader rejection for any
 * command type contained in this set, so the reply describes the SCM that
 * received the call. The set is wrapped as unmodifiable and currently holds
 * {@code InSafeMode} and {@code GetSafeModeRuleStatuses}.
 */
Set<Type> FOLLOWER_READABLE_COMMAND_TYPE = Collections.unmodifiableSet(EnumSet.of(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: FOLLOWER_READABLE_COMMAND_TYPES

Type.InSafeMode,
Type.GetSafeModeRuleStatuses));

/**
* Asks SCM where a container should be allocated. SCM responds with the
* set of datanodes that should be used creating this container.
Expand Down Expand Up @@ -390,6 +398,26 @@ Map<String, Pair<Boolean, String>> getSafeModeRuleStatuses()
*/
boolean forceExitSafeMode() throws IOException;

/**
 * Check if a specific SCM node is in safe mode.
 * In HA clusters, queries the specified node.
 *
 * <p>NOTE(review): the client-side translator selects the node by fetching
 * the proxy for {@code nodeId}; the server-side implementation shown in this
 * change does not read {@code nodeId} and answers for the SCM that received
 * the call.
 *
 * @param nodeId SCM node ID to query
 * @return true if the node is in safe mode, false otherwise
 * @throws IOException if the safe mode status cannot be obtained for the
 *         given node
 */
boolean inSafeModeForNode(String nodeId) throws IOException;

/**
 * Get safe mode rule statuses from a specific SCM node.
 * In HA clusters, queries the specified node.
 *
 * <p>NOTE(review): the client-side translator selects the node by fetching
 * the proxy for {@code nodeId}; the server-side implementation shown in this
 * change does not read {@code nodeId} and answers for the SCM that received
 * the call.
 *
 * @param nodeId SCM node ID to query
 * @return Map of rule name to rule status; in each pair the left value is
 *         the validation flag of the rule and the right value is its status
 *         text
 * @throws IOException if the rule statuses cannot be obtained for the
 *         given node
 */
Map<String, Pair<Boolean, String>> getSafeModeRuleStatusesForNode(String nodeId) throws IOException;

/**
* Start ReplicationManager.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@
import org.apache.hadoop.ozone.upgrade.UpgradeFinalization.StatusAndMessages;
import org.apache.hadoop.ozone.util.ProtobufUtils;
import org.apache.hadoop.security.token.Token;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This class is the client-side translator to translate the requests made on
Expand All @@ -162,6 +164,8 @@ public final class StorageContainerLocationProtocolClientSideTranslatorPB

// RPC proxy held by this translator (its construction is outside this view).
private final StorageContainerLocationProtocolPB rpcProxy;
// Failover proxy provider; getProxyForNode(nodeId) on it yields the proxy of
// one specific SCM for the per-node safe mode queries.
private final SCMContainerLocationFailoverProxyProvider fpp;
// Class logger.
private static final Logger LOG =
LoggerFactory.getLogger(StorageContainerLocationProtocolClientSideTranslatorPB.class);

/**
* Creates a new StorageContainerLocationProtocolClientSideTranslatorPB.
Expand Down Expand Up @@ -870,6 +874,50 @@ public boolean forceExitSafeMode() throws IOException {

}

/**
 * Asks one particular SCM whether it is in safe mode.
 *
 * <p>The proxy for {@code nodeId} is taken from the failover proxy provider
 * and an {@code InSafeMode} request is submitted directly on that proxy.
 *
 * @param nodeId SCM node ID to query
 * @return the in-safe-mode flag carried by the response of that node
 * @throws IOException wrapping any exception raised while resolving the
 *         proxy, building the request, or submitting it
 */
@Override
public boolean inSafeModeForNode(String nodeId) throws IOException {
  final InSafeModeRequestProto safeModeRequest = InSafeModeRequestProto.getDefaultInstance();

  try {
    final StorageContainerLocationProtocolPB nodeProxy = fpp.getProxyForNode(nodeId);

    // Assemble the request envelope step by step.
    final ScmContainerLocationRequest.Builder envelope = ScmContainerLocationRequest.newBuilder();
    envelope.setCmdType(Type.InSafeMode);
    envelope.setVersion(ClientVersion.CURRENT_VERSION);
    envelope.setTraceID(TracingUtil.exportCurrentSpan());
    envelope.setInSafeModeRequest(safeModeRequest);

    final ScmContainerLocationResponse reply =
        nodeProxy.submitRequest(NULL_RPC_CONTROLLER, envelope.build());
    final boolean nodeInSafeMode = reply.getInSafeModeResponse().getInSafeMode();
    return nodeInSafeMode;
  } catch (Exception e) {
    // Every failure is reported as an IOException naming the queried node.
    throw new IOException("Failed to get safe mode status from SCM node " + nodeId, e);
  }
}

/**
 * Fetches the safe mode rule statuses reported by one particular SCM.
 *
 * <p>The proxy for {@code nodeId} is taken from the failover proxy provider
 * and a {@code GetSafeModeRuleStatuses} request is submitted directly on it.
 *
 * @param nodeId SCM node ID to query
 * @return a new map from rule name to a pair of (validation flag, status
 *         text), built from the status entries in the response
 * @throws IOException wrapping any exception raised while resolving the
 *         proxy, submitting the request, or converting the response
 */
@Override
public Map<String, Pair<Boolean, String>> getSafeModeRuleStatusesForNode(String nodeId) throws IOException {
  final GetSafeModeRuleStatusesRequestProto statusesRequest =
      GetSafeModeRuleStatusesRequestProto.getDefaultInstance();

  try {
    final StorageContainerLocationProtocolPB nodeProxy = fpp.getProxyForNode(nodeId);

    // Assemble the request envelope step by step.
    final ScmContainerLocationRequest.Builder envelope = ScmContainerLocationRequest.newBuilder();
    envelope.setCmdType(Type.GetSafeModeRuleStatuses);
    envelope.setVersion(ClientVersion.CURRENT_VERSION);
    envelope.setTraceID(TracingUtil.exportCurrentSpan());
    envelope.setGetSafeModeRuleStatusesRequest(statusesRequest);

    final ScmContainerLocationResponse reply =
        nodeProxy.submitRequest(NULL_RPC_CONTROLLER, envelope.build());

    // Convert each status entry into a (validated, statusText) pair keyed by rule name.
    final Map<String, Pair<Boolean, String>> statusesByRule = new HashMap<>();
    reply.getGetSafeModeRuleStatusesResponse().getSafeModeRuleStatusesProtoList().forEach(
        entry -> statusesByRule.put(entry.getRuleName(),
            Pair.of(entry.getValidate(), entry.getStatusText())));
    return statusesByRule;
  } catch (Exception e) {
    // Every failure is reported as an IOException naming the queried node.
    throw new IOException("Failed to get safe mode rule statuses from SCM node " + nodeId, e);
  }
}

@Override
public void startReplicationManager() throws IOException {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,17 @@ public synchronized List<T> getProxies() {
.map(proxyInfo -> proxyInfo.proxy).collect(Collectors.toList());
}

/**
 * Returns the proxy associated with the given SCM node ID.
 *
 * <p>An entry already present in {@code scmProxies} is used as is. Otherwise
 * the node ID must be a key of {@code scmProxyInfoMap}, in which case a proxy
 * is created for it through {@code createSCMProxy}.
 *
 * @param nodeId SCM node ID whose proxy is requested
 * @return the proxy for that node
 * @throws IOException if the node ID is in neither map
 */
public synchronized T getProxyForNode(String nodeId) throws IOException {
  final ProxyInfo<T> existing = scmProxies.get(nodeId);
  if (existing != null) {
    return existing.proxy;
  }
  if (!scmProxyInfoMap.containsKey(nodeId)) {
    throw new IOException("Unknown SCM node ID: " + nodeId);
  }
  return createSCMProxy(nodeId).proxy;
}

@Override
public synchronized void performFailover(T newLeader) {
if (updatedLeaderNodeID != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListContainer;
import static org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type.ListPipelines;
import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.ADMIN_COMMAND_TYPE;
import static org.apache.hadoop.hdds.scm.protocol.StorageContainerLocationProtocol.FOLLOWER_READABLE_COMMAND_TYPE;

import com.google.protobuf.ProtocolMessageEnum;
import com.google.protobuf.RpcController;
Expand Down Expand Up @@ -212,7 +213,8 @@ public ScmContainerLocationResponse submitRequest(RpcController controller,
ScmContainerLocationRequest request) throws ServiceException {
// not leader or not belong to admin command.
if (!scm.checkLeader()
&& !ADMIN_COMMAND_TYPE.contains(request.getCmdType())) {
&& !ADMIN_COMMAND_TYPE.contains(request.getCmdType())
&& !FOLLOWER_READABLE_COMMAND_TYPE.contains(request.getCmdType())) {
RatisUtil.checkRatisException(
scm.getScmHAManager().getRatisServer().triggerNotLeaderException(),
scm.getClientRpcPort(), scm.getScmId(), scm.getHostname(), ROLE_TYPE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,24 @@ public Map<String, Pair<Boolean, String>> getSafeModeRuleStatuses()
}
}

/**
 * Reports the safe mode state of this SCM and records a read-success audit
 * entry for {@code IN_SAFE_MODE}.
 *
 * <p>The {@code nodeId} argument is not read here; the value comes from
 * {@link #inSafeMode()} on this server.
 * NOTE(review): if inSafeMode() writes its own audit entry, this call adds a
 * second one - confirm.
 *
 * @param nodeId SCM node ID supplied by the caller (unused)
 * @return the result of {@code inSafeMode()}
 * @throws IOException declared by the protocol interface
 */
@Override
public boolean inSafeModeForNode(String nodeId) throws IOException {
  final boolean safeMode = inSafeMode();
  AUDIT.logReadSuccess(buildAuditMessageForSuccess(SCMAction.IN_SAFE_MODE, null));
  return safeMode;
}

/**
 * Returns the safe mode rule statuses of this SCM and records a read-success
 * audit entry for {@code GET_SAFE_MODE_RULE_STATUSES}.
 *
 * <p>The {@code nodeId} argument is not read here; the value comes from
 * {@link #getSafeModeRuleStatuses()} on this server.
 * NOTE(review): if getSafeModeRuleStatuses() writes its own audit entry,
 * this call adds a second one - confirm.
 *
 * @param nodeId SCM node ID supplied by the caller (unused)
 * @return the result of {@code getSafeModeRuleStatuses()}
 * @throws IOException if {@code getSafeModeRuleStatuses()} fails
 */
@Override
public Map<String, Pair<Boolean, String>> getSafeModeRuleStatusesForNode(String nodeId) throws IOException {
  final Map<String, Pair<Boolean, String>> ruleStatuses = getSafeModeRuleStatuses();
  AUDIT.logReadSuccess(buildAuditMessageForSuccess(SCMAction.GET_SAFE_MODE_RULE_STATUSES, null));
  return ruleStatuses;
}

/**
* Force SCM out of Safe mode.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,16 @@ public boolean forceExitSafeMode() throws IOException {
return storageContainerLocationClient.forceExitSafeMode();
}

/**
 * Forwards the per-node safe mode query to the storage container location
 * client.
 *
 * @param nodeId SCM node ID to query
 * @return whatever the underlying client reports for that node
 * @throws IOException if the underlying client call fails
 */
@Override
public boolean inSafeModeForNode(String nodeId) throws IOException {
  final boolean nodeInSafeMode = storageContainerLocationClient.inSafeModeForNode(nodeId);
  return nodeInSafeMode;
}

/**
 * Forwards the per-node safe mode rule status query to the storage container
 * location client.
 *
 * @param nodeId SCM node ID to query
 * @return the rule-name to rule-status map returned by the underlying client
 * @throws IOException if the underlying client call fails
 */
@Override
public Map<String, Pair<Boolean, String>> getSafeModeRuleStatusesForNode(String nodeId) throws IOException {
  final Map<String, Pair<Boolean, String>> ruleStatuses =
      storageContainerLocationClient.getSafeModeRuleStatusesForNode(nodeId);
  return ruleStatuses;
}

@Override
public void startReplicationManager() throws IOException {
storageContainerLocationClient.startReplicationManager();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,19 @@
package org.apache.hadoop.hdds.scm.cli;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hdds.HddsUtils;
import org.apache.hadoop.hdds.cli.HddsVersionProvider;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.client.ScmClient;
import org.apache.hadoop.hdds.scm.ha.SCMNodeInfo;
import org.apache.hadoop.net.NetUtils;
import picocli.CommandLine;
import picocli.CommandLine.Command;

/**
Expand All @@ -33,9 +42,26 @@
mixinStandardHelpOptions = true,
versionProvider = HddsVersionProvider.class)
public class SafeModeCheckSubcommand extends ScmSubcommand {
// Set by the --all / -a flag. When true, execute() reports every SCM node of
// the service instead of a single node.
@CommandLine.Option(names = {"--all", "-a"},
description = "Show safe mode status for all SCM nodes in the service. " +
"When multiple SCM service IDs are configured, --service-id must be specified.")
private boolean allNodes;

/**
 * Entry point of the subcommand; picks one of three reporting modes.
 *
 * <ul>
 * <li>--all set: report every node via executeForAllNodes.</li>
 * <li>otherwise, a non-empty --scm address together with a configured SCM
 * service ID: report only the matching node via
 * executeForSpecificNodeInHA.</li>
 * <li>otherwise: fall back to executeForSingleNode.</li>
 * </ul>
 *
 * @param scmClient client used to talk to SCM
 * @throws IOException if the selected mode fails
 */
@Override
public void execute(ScmClient scmClient) throws IOException {
final OzoneConfiguration conf = getOzoneConf();
// Service ID read from the configuration; the checks in this class treat
// null as "no SCM service configured".
String serviceId = HddsUtils.getScmServiceId(conf);

// --all takes precedence over --scm.
if (allNodes) {
executeForAllNodes(scmClient);
} else if (StringUtils.isNotEmpty(getScmOption().getScm()) && serviceId != null) {
executeForSpecificNodeInHA(scmClient, serviceId);
} else {
executeForSingleNode(scmClient);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In normal or existing behaviour we need safemode status from leader node most of the time. When no scm address is passed, whether we are getting safe mode status from leader node or not? Because now follower also can accept safemode and can return the status.

Copy link
Contributor Author

@sreejasahithi sreejasahithi Jan 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @ashishkumar50 for finding this bug, you are right now that we are allowing follower to also accept status command there can be a possibility where when we run safemode status command with no additional option it can return the status of the follower.

I have fixed this issue.

}
}

private void executeForSingleNode(ScmClient scmClient) throws IOException {
boolean execReturn = scmClient.inSafeMode();

// Output data list
Expand All @@ -45,12 +71,106 @@ public void execute(ScmClient scmClient) throws IOException {
System.out.println("SCM is out of safe mode.");
}
if (isVerbose()) {
for (Map.Entry<String, Pair<Boolean, String>> entry :
scmClient.getSafeModeRuleStatuses().entrySet()) {
Pair<Boolean, String> value = entry.getValue();
System.out.printf("validated:%s, %s, %s%n",
value.getLeft(), entry.getKey(), value.getRight());
printSafeModeRules(scmClient.getSafeModeRuleStatuses());
}
}

/**
 * Reports safe mode status for the single SCM node whose client address
 * matches the --scm option.
 *
 * <p>Prints the service ID, builds the node list from the configuration,
 * keeps the nodes accepted by matchesAddress and queries the first of them.
 * execute() calls this only after checking that the --scm value is
 * non-empty.
 *
 * @param scmClient client used to query the node
 * @param serviceId SCM service ID, used for output and error text
 * @throws IOException if no configured node matches the --scm address; the
 *         message lists the available nodes
 */
private void executeForSpecificNodeInHA(ScmClient scmClient, String serviceId) throws IOException {
String scmAddress = getScmOption().getScm();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scmAddress is not mandatory option.


System.out.println("Service ID: " + serviceId);

final OzoneConfiguration conf = getOzoneConf();

List<SCMNodeInfo> nodes = SCMNodeInfo.buildNodeInfo(conf);

// Find the node matching the --scm address
List<SCMNodeInfo> matchedNodes = nodes.stream()
.filter(node -> matchesAddress(node, scmAddress))
.collect(Collectors.toList());

// No match: fail with a message that shows each known address and node ID.
if (matchedNodes.isEmpty()) {
throw new IOException("Specified --scm address " + scmAddress +
" does not match any node in service " + serviceId +
". Available nodes: " + nodes.stream()
.map(n -> n.getScmClientAddress() + " [" + n.getNodeId() + "]")
.collect(Collectors.joining(", ")));
}

// Only the first match is queried, even if several nodes matched.
queryNode(scmClient, matchedNodes.get(0));
}

/**
 * Reports safe mode status for every SCM node listed in the configuration.
 *
 * <p>When no SCM service ID is configured the single-node report is used
 * instead. Otherwise the service ID is printed and each node returned by
 * {@code SCMNodeInfo.buildNodeInfo} is queried in turn.
 *
 * @param scmClient client used to query the nodes
 * @throws IOException if the single-node fallback fails
 */
private void executeForAllNodes(ScmClient scmClient) throws IOException {
  final OzoneConfiguration ozoneConf = getOzoneConf();
  final String scmServiceId = HddsUtils.getScmServiceId(ozoneConf);

  if (scmServiceId != null) {
    System.out.println("Service ID: " + scmServiceId);
    SCMNodeInfo.buildNodeInfo(ozoneConf)
        .forEach(nodeInfo -> queryNode(scmClient, nodeInfo));
  } else {
    executeForSingleNode(scmClient);
  }
}

/**
 * Prints one status line for the given SCM node and, in verbose mode, its
 * safe mode rule statuses.
 *
 * <p>Any exception raised while querying is reported as an ERROR line for
 * that node instead of being propagated, so one failing node does not stop
 * the caller from processing the next one.
 *
 * @param scmClient client used to query the node
 * @param node node whose ID and client address identify the target
 */
private void queryNode(ScmClient scmClient, SCMNodeInfo node) {
  final String id = node.getNodeId();

  try {
    final String state;
    if (scmClient.inSafeModeForNode(id)) {
      state = "IN SAFE MODE";
    } else {
      state = "OUT OF SAFE MODE";
    }
    System.out.printf("%s [%s]: %s%n", node.getScmClientAddress(), id, state);

    if (!isVerbose()) {
      return;
    }
    final Map<String, Pair<Boolean, String>> ruleStatuses =
        scmClient.getSafeModeRuleStatusesForNode(id);
    // Nothing to print for a missing or empty rule map.
    if (ruleStatuses == null || ruleStatuses.isEmpty()) {
      return;
    }
    printSafeModeRules(ruleStatuses);
  } catch (Exception e) {
    System.out.printf("%s [%s]: ERROR: %s%n", node.getScmClientAddress(), id, e.getMessage());
  }
}

/**
 * Check if the given SCMNodeInfo matches the target address.
 * Tries to match by direct string comparison and by resolved address.
 *
 * <p>The second attempt turns both strings into socket addresses and
 * requires equal ports and equal {@code getAddress()} values. Any exception
 * raised in that attempt is swallowed and reported as "no match"; this
 * includes the NullPointerException that results when the target host is
 * unresolved and {@code getAddress()} returns null.
 *
 * @param node configured SCM node to test
 * @param targetAddress address string supplied by the user
 * @return true if the node's client address equals or resolves to the
 *         target address, false otherwise
 */
private boolean matchesAddress(SCMNodeInfo node, String targetAddress) {
String nodeAddress = node.getScmClientAddress();

// Direct match
if (nodeAddress.equals(targetAddress)) {
return true;
}

// Try normalizing both addresses and comparing
try {
InetSocketAddress target = NetUtils.createSocketAddr(targetAddress);
InetSocketAddress nodeAddr = NetUtils.createSocketAddr(nodeAddress);

// Match by resolved IP and port
return target.getPort() == nodeAddr.getPort() &&
target.getAddress().equals(nodeAddr.getAddress());
} catch (Exception e) {
// If address resolution fails, no match
return false;
Copy link
Contributor

@sadanand48 sadanand48 Jan 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit : Log the exception here before returning false

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have removed the logging here because it creates unwanted noise in the CLI output. Instead, I have ensured that actual errors are properly surfaced when no leader can be determined or when the node specified in --scm option doesn't match, clear error messages are thrown to the user.

}
}

/**
 * Prints one line per safe mode rule in the form
 * {@code validated:<flag>, <rule name>, <status text>}.
 *
 * @param rules map from rule name to a pair of (validation flag, status
 *        text)
 */
private void printSafeModeRules(Map<String, Pair<Boolean, String>> rules) {
  rules.forEach((ruleName, status) ->
      System.out.printf("validated:%s, %s, %s%n",
          status.getLeft(), ruleName, status.getRight()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,8 @@ public SCMSecurityProtocol createScmSecurityClient() {
"Can't create SCM Security client", ex);
}
}

/**
 * Returns the raw value of the {@code scm} field.
 *
 * <p>The safe mode subcommand treats it as the address passed with --scm and
 * checks it for emptiness before use, so callers should expect that it may
 * be unset.
 *
 * @return the configured SCM address string
 */
public String getScm() {
  return this.scm;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ public abstract class ScmSubcommand extends AbstractSubcommand implements Callab

protected abstract void execute(ScmClient client) throws IOException;

/**
 * Gives subclasses access to the SCM option holder of this command, the same
 * object that {@code call()} uses to create the SCM client.
 *
 * @return the {@code scmOption} field
 */
protected ScmOption getScmOption() {
  return this.scmOption;
}

@Override
public final Void call() throws Exception {
try (ScmClient scmClient = scmOption.createScmClient()) {
Expand Down