
Commit 8e911a0

Author: ukumawat

HBASE-28158 update crashed time, unit test and checkstyle fix
1 parent 77bb10c commit 8e911a0

9 files changed: +41 / -17 lines changed


hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java

Lines changed: 4 additions & 0 deletions
@@ -75,6 +75,10 @@ synchronized void putIfAbsent(ServerName sn) {
     this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
   }
 
+  synchronized void putIfAbsent(ServerName sn, Long crashedTime) {
+    this.deadServers.putIfAbsent(sn, crashedTime);
+  }
+
   public synchronized int size() {
     return deadServers.size();
   }
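The existing single-argument overload always stamps a dead server with the current time; the new overload lets the caller supply the original crash time. A minimal sketch of the semantics, using a plain map with String keys as a stand-in for the real ServerName-keyed field (not the actual HBase class):

import java.util.concurrent.ConcurrentHashMap;

// Simplified model of DeadServer's timestamp bookkeeping (stand-in types only).
class DeadServerSketch {
  private final ConcurrentHashMap<String, Long> deadServers = new ConcurrentHashMap<>();

  // Existing behaviour: record "now" as the crash time.
  synchronized void putIfAbsent(String sn) {
    deadServers.putIfAbsent(sn, System.currentTimeMillis());
  }

  // New overload: the caller supplies the crash time (e.g. an SCP's submitted time),
  // so a master restart does not reset the timestamp to the restart time.
  synchronized void putIfAbsent(String sn, Long crashedTime) {
    deadServers.putIfAbsent(sn, crashedTime);
  }
}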

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 4 additions & 2 deletions
@@ -1033,7 +1033,8 @@ private void finishActiveMasterInitialization() throws IOException, InterruptedE
     Map<Class<?>, List<Procedure<MasterProcedureEnv>>> procsByType = procedureExecutor
       .getActiveProceduresNoCopy().stream().collect(Collectors.groupingBy(p -> p.getClass()));
 
-    // This manager must be accessed AFTER hbase:meta is confirmed on line..
+    // This manager must be accessed AFTER hbase:meta is confirmed on line.. and must be initialised
+    // before assignment manager
     this.tableStateManager = new TableStateManager(this);
     // Create Assignment Manager
     this.assignmentManager = createAssignmentManager(this, masterRegion);
@@ -1055,7 +1056,8 @@ private void finishActiveMasterInitialization() throws IOException, InterruptedE
     // TODO: Generate the splitting and live Set in one pass instead of two as we currently do.
     this.regionServerTracker.upgrade(
       procsByType.getOrDefault(ServerCrashProcedure.class, Collections.emptyList()).stream()
-        .map(p -> (ServerCrashProcedure) p).map(p -> p.getServerName()).collect(Collectors.toSet()),
+        .map(p -> (ServerCrashProcedure) p).collect(
+          Collectors.toMap(ServerCrashProcedure::getServerName, Procedure::getSubmittedTime)),
       Sets.union(rsListStorage.getAll(), walManager.getLiveServersFromWALDir()),
       walManager.getSplittingServersFromWALDir());
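The interesting piece here is the collector change: instead of reducing the outstanding ServerCrashProcedures to a Set of server names, they are collected into a Map keyed by server name with each procedure's submitted time as the value. A hedged, self-contained sketch of that stream pattern with stub types (the real code uses ServerCrashProcedure and ServerName):

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Stand-in for a ServerCrashProcedure: exposes the crashed server and when the SCP was submitted.
class ScpStub {
  private final String serverName;
  private final long submittedTime;

  ScpStub(String serverName, long submittedTime) {
    this.serverName = serverName;
    this.submittedTime = submittedTime;
  }

  String getServerName() { return serverName; }
  long getSubmittedTime() { return submittedTime; }
}

class DeadServersFromPeSketch {
  // Collect (server -> SCP submitted time); this assumes at most one SCP per server,
  // otherwise Collectors.toMap would throw on the duplicate key.
  static Map<String, Long> collect(List<ScpStub> outstandingScps) {
    return outstandingScps.stream()
      .collect(Collectors.toMap(ScpStub::getServerName, ScpStub::getSubmittedTime));
  }
}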

hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java

Lines changed: 6 additions & 4 deletions
@@ -24,6 +24,7 @@
 import java.io.InterruptedIOException;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -116,22 +117,23 @@ private RegionServerInfo getServerInfo(ServerName serverName)
    * In this method, we will also construct the region server sets in {@link ServerManager}. If a
    * region server is dead between the crash of the previous master instance and the start of the
    * current master instance, we will schedule a SCP for it. This is done in
-   * {@link ServerManager#findDeadServersAndProcess(Set, Set)}, we call it here under the lock
+   * {@link ServerManager#findDeadServersAndProcess(Map, Set)}, we call it here under the lock
    * protection to prevent concurrency issues with server expiration operation.
    * @param deadServersFromPE the region servers which already have SCP associated.
    * @param liveServersBeforeRestart the live region servers we recorded before master restarts.
    * @param splittingServersFromWALDir Servers whose WALs are being actively 'split'.
    */
-  public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersBeforeRestart,
-    Set<ServerName> splittingServersFromWALDir) throws KeeperException, IOException {
+  public void upgrade(Map<ServerName, Long> deadServersFromPE,
+    Set<ServerName> liveServersBeforeRestart, Set<ServerName> splittingServersFromWALDir)
+    throws KeeperException, IOException {
     LOG.info(
       "Upgrading RegionServerTracker to active master mode; {} have existing"
         + "ServerCrashProcedures, {} possibly 'live' servers, and {} 'splitting'.",
       deadServersFromPE.size(), liveServersBeforeRestart.size(), splittingServersFromWALDir.size());
     // deadServersFromPE is made from a list of outstanding ServerCrashProcedures.
     // splittingServersFromWALDir are being actively split -- the directory in the FS ends in
     // '-SPLITTING'. Each splitting server should have a corresponding SCP. Log if not.
-    splittingServersFromWALDir.stream().filter(s -> !deadServersFromPE.contains(s))
+    splittingServersFromWALDir.stream().filter(s -> !deadServersFromPE.containsKey(s))
       .forEach(s -> LOG.error("{} has no matching ServerCrashProcedure", s));
     // create ServerNode for all possible live servers from wal directory and master local region
     liveServersBeforeRestart
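Since deadServersFromPE is now a Map rather than a Set, the splitting-directory cross-check switches from contains to containsKey; the lookup itself is unchanged. A small sketch of that filter, with illustrative String keys in place of ServerName:

import java.util.Map;
import java.util.Set;

class SplittingCrossCheckSketch {
  // Warn about any '-SPLITTING' WAL directory whose server has no outstanding SCP.
  static void logUnmatched(Map<String, Long> deadServersFromPE, Set<String> splittingServersFromWALDir) {
    splittingServersFromWALDir.stream()
      .filter(s -> !deadServersFromPE.containsKey(s)) // was: !deadServersFromPE.contains(s)
      .forEach(s -> System.err.println(s + " has no matching ServerCrashProcedure"));
  }
}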

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 1 addition & 1 deletion
@@ -440,7 +440,7 @@ boolean checkAndRecordNewServer(final ServerName serverName, final ServerMetrics
    * @param deadServersFromPE the region servers which already have a SCP associated.
    * @param liveServersFromWALDir the live region servers from wal directory.
    */
-  void findDeadServersAndProcess(Set<ServerName> deadServersFromPE,
+  void findDeadServersAndProcess(Map<ServerName, Long> deadServersFromPE,
     Set<ServerName> liveServersFromWALDir) {
     deadServersFromPE.forEach(deadservers::putIfAbsent);
     liveServersFromWALDir.stream().filter(sn -> !onlineServers.containsKey(sn))
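One subtlety worth noting: forEach on a Map takes a BiConsumer, so the unchanged method reference deadservers::putIfAbsent now resolves to the new two-argument overload and carries the recorded crash time through. A hedged sketch of that binding with stand-in types:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class FindDeadServersSketch {
  private final Map<String, Long> deadServers = new ConcurrentHashMap<>();

  // Matches the shape of the new DeadServer overload.
  void putIfAbsent(String sn, Long crashedTime) {
    deadServers.putIfAbsent(sn, crashedTime);
  }

  // Map.forEach supplies (key, value) pairs, so each dead server is recorded
  // with the crash time taken from its ServerCrashProcedure, not with "now".
  void findDeadServersAndProcess(Map<String, Long> deadServersFromPE) {
    deadServersFromPE.forEach(this::putIfAbsent);
  }
}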

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 14 additions & 2 deletions
@@ -23,6 +23,7 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -1882,7 +1883,16 @@ public void visitRegionState(Result result, final RegionInfo regionInfo, final S
         }
         // add regions to RIT while visiting the meta
         regionInTransitionTracker.handleRegionStateNodeOperation(regionNode);
-        if (master.getServerManager().isServerDead(regionNode.getRegionLocation())) {
+        // If region location of region belongs to a dead server mark the region crashed
+        if (
+          regionNode.getRegionLocation() != null
+            && master.getServerManager().isServerDead(regionNode.getRegionLocation())
+        ) {
+          Date timeOfCrash =
+            master.getServerManager().getDeadServers().getTimeOfDeath(regionNode.getRegionLocation());
+          if (timeOfCrash != null) {
+            regionNode.crashed(timeOfCrash.getTime());
+          }
           regionInTransitionTracker.regionCrashed(regionNode);
         }
       }
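During the meta scan, a region whose last known location is a dead server is now stamped with that server's recorded time of death (taken from DeadServers as a java.util.Date) instead of being left at the default lastUpdate. A simplified, self-contained model of that branch with stub types, not the real AssignmentManager:

import java.util.Date;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class MetaVisitCrashSketch {
  static class RegionNodeStub {
    String regionLocation;   // may be null for never-assigned regions
    volatile long lastUpdate;
    void crashed(long crashTime) { lastUpdate = crashTime; }
  }

  private final Map<String, Long> deadServers = new ConcurrentHashMap<>();

  void visit(RegionNodeStub regionNode) {
    // Null check first: a region with no location cannot be on a dead server.
    if (regionNode.regionLocation != null && deadServers.containsKey(regionNode.regionLocation)) {
      Date timeOfCrash = new Date(deadServers.get(regionNode.regionLocation));
      regionNode.crashed(timeOfCrash.getTime()); // stamp with the crash time, not "now"
    }
  }
}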
@@ -2317,10 +2327,12 @@ public CompletableFuture<Void> regionClosedAbnormally(RegionStateNode regionNode
   // As soon as a server a crashed, region hosting on that are un-available, this method helps to
   // track those un-available regions. This method can only be called from ServerCrashProcedure.
   public void markRegionsAsCrashed(List<RegionInfo> regionsOnCrashedServer,
-    ServerName crashedServerName) {
+    ServerCrashProcedure scp) {
+    ServerName crashedServerName = scp.getServerName();
     for (RegionInfo regionInfo : regionsOnCrashedServer) {
       RegionStateNode node = regionStates.getOrCreateRegionStateNode(regionInfo);
       if (node.getRegionLocation() == crashedServerName) {
+        node.crashed(scp.getSubmittedTime());
         regionInTransitionTracker.regionCrashed(node);
       }
     }
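Passing the whole ServerCrashProcedure instead of just the server name lets markRegionsAsCrashed stamp every affected region with the SCP's submitted time, which approximates when the server actually went down. A hedged sketch with stand-in types (the real code resolves RegionStateNodes from RegionInfos via the region states map):

import java.util.List;

class MarkRegionsAsCrashedSketch {
  static class RegionNodeStub {
    String regionLocation;
    volatile long lastUpdate;
    void crashed(long crashTime) { lastUpdate = crashTime; }
  }

  static class ScpStub {
    String serverName;
    long submittedTime;
  }

  // Only regions still located on the crashed server are stamped.
  static void markRegionsAsCrashed(List<RegionNodeStub> regionsOnCrashedServer, ScpStub scp) {
    for (RegionNodeStub node : regionsOnCrashedServer) {
      if (scp.serverName.equals(node.regionLocation)) {
        node.crashed(scp.submittedTime);
      }
    }
  }
}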

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java

Lines changed: 5 additions & 1 deletion
@@ -96,7 +96,7 @@ public AssignmentProcedureEvent(final RegionInfo regionInfo) {
 
   /**
    * Updated whenever a call to {@link #setRegionLocation(ServerName)} or
-   * {@link #setState(RegionState.State, RegionState.State...)}.
+   * {@link #setState(RegionState.State, RegionState.State...)} or {@link #crashed(long)}.
    */
   private volatile long lastUpdate = 0;
 
@@ -190,6 +190,10 @@ public void setLastHost(final ServerName serverName) {
     this.lastHost = serverName;
   }
 
+  public void crashed(long crashTime) {
+    this.lastUpdate = crashTime;
+  }
+
   public void setOpenSeqNum(final long seqId) {
     this.openSeqNum = seqId;
   }

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java

Lines changed: 1 addition & 2 deletions
@@ -208,8 +208,7 @@ protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
         if (LOG.isTraceEnabled()) {
           this.regionsOnCrashedServer.stream().forEach(ri -> LOG.trace(ri.getShortNameToLog()));
         }
-        env.getAssignmentManager().markRegionsAsCrashed(regionsOnCrashedServer,
-          this.serverName);
+        env.getAssignmentManager().markRegionsAsCrashed(regionsOnCrashedServer, this);
       }
       if (!this.shouldSplitWal) {
         setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);

hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestFavoredStochasticLoadBalancer.java

Lines changed: 3 additions & 1 deletion
@@ -263,7 +263,9 @@ public void testBalancerWithoutFavoredNodes() throws Exception {
 
     // Balancer should unassign the region
    assertTrue("Balancer did not run", admin.balance());
-    TEST_UTIL.waitUntilNoRegionsInTransition();
+    TEST_UTIL.waitUntilNoRegionTransitScheduled();
+    assertEquals("One region should be unassigned", 1,
+      master.getAssignmentManager().getRegionsInTransitionCount());
 
     admin.assign(region.getEncodedNameAsBytes());
     TEST_UTIL.waitUntilNoRegionsInTransition(60000);

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java

Lines changed: 3 additions & 4 deletions
@@ -1146,10 +1146,9 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(RpcContro
       ) {
         AssignmentManager am = myMaster.getAssignmentManager();
         for (RegionStateNode regionState : am.getRegionsInTransition()) {
-          /*
-           * TODO!!!! // Find the merging_new region and remove it if (regionState.isSplittingNew())
-           * { regionStates.deleteRegion(regionState.getRegion()); }
-           */
+          if (regionState.toRegionState().isSplittingNew()) {
+            am.getRegionStates().deleteRegion(regionState.toRegionState().getRegion());
+          }
         }
       }
       return resp;
