Commit 2a8e3ec

Authored by pradeepppc (Pradeep Pallikila) and Pradeep Pallikila
[GOBBLIN-2052] Release containers that are running Helix tasks stuck in any of the given states (apache#3932)

* remove container which runs a Helix task that is stuck in INIT state
* refactored
* refactored
* refactored
* refactored
* refactored
* added UTs
* GTE event
* fix UTs
* added flag to enable releasing of the container or not
* added feature flag enabling the stuck task detection feature
* added capability to check for tasks that are stuck in any given state
* fixed build
* added extra check to not have running state as a stuck state
* fixing build
* resolved comments
* resolved comments

---------

Co-authored-by: Pradeep Pallikila <[email protected]>
1 parent ac653fb commit 2a8e3ec
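The feature is opt-in and driven entirely by the new autoScaling.* keys introduced below in YarnAutoScalingManager.java. The following minimal sketch shows how a deployment might enable it, assuming GobblinYarnConfigurationKeys.GOBBLIN_YARN_PREFIX resolves to "gobblin.yarn." and using the Typesafe Config API that ConfigUtils reads from; the class name and the chosen task states are illustrative, not part of this commit.

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class StuckTaskDetectionConfigSketch {
  public static void main(String[] args) {
    // Hypothetical config fragment; key names come from the constants added in this commit,
    // while the "gobblin.yarn." prefix is an assumption about GOBBLIN_YARN_PREFIX.
    Config config = ConfigFactory.parseString(
        "gobblin.yarn.autoScaling.detectIfTaskIsStuck = true\n"
            // values must be org.apache.helix.task.TaskPartitionState names; RUNNING is skipped with a warning
            + "gobblin.yarn.autoScaling.enableDetectionForTaskStates = [INIT, TASK_ERROR]\n"
            + "gobblin.yarn.autoScaling.releaseContainerIfTaskIsStuck = true\n"
            + "gobblin.yarn.autoScaling.stuckTaskContainerReleaseThresholdMinutes = 20\n");

    // The autoscaler reads the state list the same way, via ConfigUtils.getStringList(...).
    System.out.println(config.getStringList("gobblin.yarn.autoScaling.enableDetectionForTaskStates"));
  }
}

If detectIfTaskIsStuck is left at its default of false, the detection set stays empty and no container is ever flagged; releaseContainerIfTaskIsStuck additionally gates whether a flagged container is actually released or only reported through a GTE.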

File tree

6 files changed: +274 −39 lines


gobblin-cluster/src/main/java/org/apache/gobblin/cluster/GobblinHelixConstants.java

+2 −0

@@ -24,4 +24,6 @@ public class GobblinHelixConstants {
 
   public static final String SHUTDOWN_MESSAGE_TYPE = "SHUTDOWN";
 
+  public static final String HELIX_INSTANCE_NAME_KEY = "HelixInstanceName";
+
 }

gobblin-yarn/src/main/java/org/apache/gobblin/yarn/GobblinYarnEventConstants.java

+1 −0

@@ -45,5 +45,6 @@ public static class EventNames {
     public static final String ERROR = "Error";
     public static final String HELIX_INSTANCE_COMPLETION = "HelixInstanceCompletion";
     public static final String SHUTDOWN_REQUEST = "ShutdownRequest";
+    public static final String HELIX_PARTITION_STUCK = "HelixPartitionStuck";
   }
 }

gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnAutoScalingManager.java

+142 −13

@@ -22,6 +22,7 @@
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.PriorityQueue;
@@ -32,7 +33,11 @@
 import java.util.stream.Collectors;
 
 import org.apache.commons.compress.utils.Sets;
+
+import org.apache.gobblin.cluster.GobblinHelixConstants;
 import org.apache.gobblin.stream.WorkUnitChangeEvent;
+
+import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.helix.HelixDataAccessor;
 import org.apache.helix.HelixManager;
@@ -60,6 +65,7 @@
 import org.apache.gobblin.cluster.HelixUtils;
 import org.apache.gobblin.util.ConfigUtils;
 import org.apache.gobblin.util.ExecutorsUtils;
+import org.apache.gobblin.yarn.event.ContainerReleaseRequest;
 
 import static org.apache.gobblin.yarn.GobblinYarnTaskRunner.HELIX_YARN_INSTANCE_NAME_PREFIX;
 
@@ -71,8 +77,7 @@
 @Slf4j
 public class YarnAutoScalingManager extends AbstractIdleService {
   private final String AUTO_SCALING_PREFIX = GobblinYarnConfigurationKeys.GOBBLIN_YARN_PREFIX + "autoScaling.";
-  private final String AUTO_SCALING_POLLING_INTERVAL_SECS =
-      AUTO_SCALING_PREFIX + "pollingIntervalSeconds";
+  private final String AUTO_SCALING_POLLING_INTERVAL_SECS = AUTO_SCALING_PREFIX + "pollingIntervalSeconds";
   private final String TASK_NUMBER_OF_ATTEMPTS_THRESHOLD = AUTO_SCALING_PREFIX + "taskAttemptsThreshold";
   private final int DEFAULT_TASK_NUMBER_OF_ATTEMPTS_THRESHOLD = 20;
   private final String SPLIT_WORKUNIT_REACH_ATTEMPTS_THRESHOLD = AUTO_SCALING_PREFIX + "splitWorkUnitReachThreshold";
@@ -82,21 +87,24 @@ public class YarnAutoScalingManager extends AbstractIdleService {
   private final String AUTO_SCALING_PARTITIONS_PER_CONTAINER = AUTO_SCALING_PREFIX + "partitionsPerContainer";
   private final int DEFAULT_AUTO_SCALING_PARTITIONS_PER_CONTAINER = 1;
   private final String AUTO_SCALING_CONTAINER_OVERPROVISION_FACTOR = AUTO_SCALING_PREFIX + "overProvisionFactor";
+  private final String STUCK_TASK_CONTAINER_RELEASE_THRESHOLD_MINUTES =
+      AUTO_SCALING_PREFIX + "stuckTaskContainerReleaseThresholdMinutes";
+  private final String RELEASE_CONTAINER_IF_TASK_IS_STUCK = AUTO_SCALING_PREFIX + "releaseContainerIfTaskIsStuck";
+  private final String DETECT_IF_TASK_IS_STUCK = AUTO_SCALING_PREFIX + "detectIfTaskIsStuck";
+  private final String ENABLE_DETECTION_FOR_TASK_STATES = AUTO_SCALING_PREFIX + "enableDetectionForTaskStates";
   private final double DEFAULT_AUTO_SCALING_CONTAINER_OVERPROVISION_FACTOR = 1.0;
+  private final String AUTO_SCALING_INITIAL_DELAY = AUTO_SCALING_PREFIX + "initialDelay";
+  private final int DEFAULT_AUTO_SCALING_INITIAL_DELAY_SECS = 60;
+  private final String AUTO_SCALING_WINDOW_SIZE = AUTO_SCALING_PREFIX + "windowSize";
+  public final static int DEFAULT_MAX_CONTAINER_IDLE_TIME_BEFORE_SCALING_DOWN_MINUTES = 10;
+  private final static int DEFAULT_MAX_TIME_MINUTES_TO_RELEASE_CONTAINER_HAVING_HELIX_TASK_THAT_IS_STUCK = 20;
+
   // The cluster level default tags for Helix instances
   private final String defaultHelixInstanceTags;
   private final int defaultContainerMemoryMbs;
   private final int defaultContainerCores;
-
-  private final String AUTO_SCALING_INITIAL_DELAY = AUTO_SCALING_PREFIX + "initialDelay";
-  private final int DEFAULT_AUTO_SCALING_INITIAL_DELAY_SECS = 60;
   private int taskAttemptsThreshold;
   private final boolean splitWorkUnitReachThreshold;
-
-
-  private final String AUTO_SCALING_WINDOW_SIZE = AUTO_SCALING_PREFIX + "windowSize";
-
-  public final static int DEFAULT_MAX_CONTAINER_IDLE_TIME_BEFORE_SCALING_DOWN_MINUTES = 10;
   private final Config config;
   private final HelixManager helixManager;
   private final ScheduledExecutorService autoScalingExecutor;
@@ -105,6 +113,10 @@ public class YarnAutoScalingManager extends AbstractIdleService {
   private final double overProvisionFactor;
   private final SlidingWindowReservoir slidingFixedSizeWindow;
   private static int maxIdleTimeInMinutesBeforeScalingDown = DEFAULT_MAX_CONTAINER_IDLE_TIME_BEFORE_SCALING_DOWN_MINUTES;
+  private final int maxTimeInMinutesBeforeReleasingContainerHavingStuckTask;
+  private final boolean enableReleasingContainerHavingStuckTask;
+  private final boolean enableDetectionStuckTask;
+  private final HashSet<TaskPartitionState> detectionForStuckTaskStates;
   private static final HashSet<TaskPartitionState>
       UNUSUAL_HELIX_TASK_STATES = Sets.newHashSet(TaskPartitionState.ERROR, TaskPartitionState.DROPPED, TaskPartitionState.COMPLETED, TaskPartitionState.TIMED_OUT);
 
@@ -136,6 +148,34 @@ public YarnAutoScalingManager(GobblinApplicationMaster appMaster) {
         DEFAULT_TASK_NUMBER_OF_ATTEMPTS_THRESHOLD);
     this.splitWorkUnitReachThreshold = ConfigUtils.getBoolean(this.config, SPLIT_WORKUNIT_REACH_ATTEMPTS_THRESHOLD,
         DEFAULT_SPLIT_WORKUNIT_REACH_ATTEMPTS_THRESHOLD);
+    this.maxTimeInMinutesBeforeReleasingContainerHavingStuckTask = ConfigUtils.getInt(this.config,
+        STUCK_TASK_CONTAINER_RELEASE_THRESHOLD_MINUTES,
+        DEFAULT_MAX_TIME_MINUTES_TO_RELEASE_CONTAINER_HAVING_HELIX_TASK_THAT_IS_STUCK);
+    this.enableReleasingContainerHavingStuckTask = ConfigUtils.getBoolean(this.config,
+        RELEASE_CONTAINER_IF_TASK_IS_STUCK, false);
+    this.enableDetectionStuckTask = ConfigUtils.getBoolean(this.config, DETECT_IF_TASK_IS_STUCK, false);
+    this.detectionForStuckTaskStates = getTaskStatesForWhichDetectionIsEnabled();
+  }
+
+  private HashSet<TaskPartitionState> getTaskStatesForWhichDetectionIsEnabled() {
+    HashSet<TaskPartitionState> taskStates = new HashSet<>();
+    if (this.enableDetectionStuckTask) {
+      List<String> taskStatesEnabledForDetection = ConfigUtils.getStringList(this.config, ENABLE_DETECTION_FOR_TASK_STATES);
+      for (String taskState : taskStatesEnabledForDetection) {
+        try {
+          TaskPartitionState helixTaskState = TaskPartitionState.valueOf(taskState);
+          if (helixTaskState == TaskPartitionState.RUNNING) {
+            log.warn("Running state is not allowed for detection as it is not a stuck state, ignoring");
+            continue;
+          }
+          taskStates.add(helixTaskState);
+        } catch (IllegalArgumentException e) {
+          log.warn("Invalid task state {} provided for detection, ignoring", taskState);
+        }
+      }
+    }
+    log.info("Detection of task being stuck is enabled on following task states {}", taskStates);
+    return taskStates;
   }
 
   @Override
@@ -150,7 +190,9 @@ protected void startUp() {
     this.autoScalingExecutor.scheduleAtFixedRate(new YarnAutoScalingRunnable(new TaskDriver(this.helixManager),
         this.yarnService, this.partitionsPerContainer, this.overProvisionFactor,
         this.slidingFixedSizeWindow, this.helixManager.getHelixDataAccessor(), this.defaultHelixInstanceTags,
-        this.defaultContainerMemoryMbs, this.defaultContainerCores, this.taskAttemptsThreshold, this.splitWorkUnitReachThreshold),
+        this.defaultContainerMemoryMbs, this.defaultContainerCores, this.taskAttemptsThreshold,
+        this.splitWorkUnitReachThreshold, this.maxTimeInMinutesBeforeReleasingContainerHavingStuckTask,
+        this.enableReleasingContainerHavingStuckTask, this.enableDetectionStuckTask, this.detectionForStuckTaskStates),
         initialDelay, scheduleInterval, TimeUnit.SECONDS);
   }
 
@@ -179,13 +221,22 @@ static class YarnAutoScalingRunnable implements Runnable {
     private final int defaultContainerCores;
     private final int taskAttemptsThreshold;
     private final boolean splitWorkUnitReachThreshold;
+    private final int maxTimeInMinutesBeforeReleasingContainerHavingStuckTask;
+    private final boolean enableReleasingContainerHavingStuckTask;
+    private final boolean enableDetectionStuckTask;
+    private final HashSet<TaskPartitionState> taskStates;
 
     /**
     * A static map that keep track of an idle instance and its latest beginning idle time.
     * If an instance is no longer idle when inspected, it will be dropped from this map.
     */
     private static final Map<String, Long> instanceIdleSince = new HashMap<>();
-
+    /**
+    * A static nested map that keep track of an instances which contains the tasks which are present in any of the
+    * configured states along with its latest beginning idle time in any of those states. If an instance is no longer
+    * in the given states when inspected, it will be dropped from this map.
+    */
+    private static final Map<String, Long> instanceStuckSince = new HashMap<>();
 
     @Override
     public void run() {
@@ -219,13 +270,30 @@ private String getInuseParticipantForHelixPartition(JobContext jobContext, int p
       return null;
     }
 
+
+    private String getParticipantInGivenStateForHelixPartition(final JobContext jobContext, final int partition,
+        final HashSet<TaskPartitionState> taskStates) {
+      if (taskStates.contains(jobContext.getPartitionState(partition))) {
+        log.info("Helix task {} is in {} state at helix participant {}",
+            jobContext.getTaskIdForPartition(partition), jobContext.getPartitionState(partition),
+            jobContext.getAssignedParticipant(partition));
+        return jobContext.getAssignedParticipant(partition);
+      }
+
+      return null;
+    }
+
     /**
     * Iterate through the workflows configured in Helix to figure out the number of required partitions
     * and request the {@link YarnService} to scale to the desired number of containers.
     */
     @VisibleForTesting
     void runInternal() {
       Set<String> inUseInstances = new HashSet<>();
+      // helixInstancesContainingStuckTasks maintains the set of helix instances/participants containing tasks that are
+      // stuck in any of the configured states.
+      final Set<String> helixInstancesContainingStuckTasks = new HashSet<>();
+
       YarnContainerRequestBundle yarnContainerRequestBundle = new YarnContainerRequestBundle();
       for (Map.Entry<String, WorkflowConfig> workFlowEntry : taskDriver.getWorkflows().entrySet()) {
         WorkflowContext workflowContext = taskDriver.getWorkflowContext(workFlowEntry.getKey());
@@ -259,6 +327,13 @@ void runInternal() {
             .map(i -> getInuseParticipantForHelixPartition(jobContext, i))
             .filter(Objects::nonNull).collect(Collectors.toSet()));
 
+        if (enableDetectionStuckTask) {
+          // if feature is not enabled the set helixInstancesContainingStuckTasks will always be empty
+          helixInstancesContainingStuckTasks.addAll(jobContext.getPartitionSet().stream()
+              .map(helixPartition -> getParticipantInGivenStateForHelixPartition(jobContext, helixPartition, taskStates))
+              .filter(Objects::nonNull).collect(Collectors.toSet()));
+        }
+
         numPartitions = jobContext.getPartitionSet().size();
         // Job level config for helix instance tags takes precedence over other tag configurations
         if (jobConfig != null) {
@@ -286,6 +361,8 @@ void runInternal() {
       // and potentially replanner-instance.
       Set<String> allParticipants = HelixUtils.getParticipants(helixDataAccessor, HELIX_YARN_INSTANCE_NAME_PREFIX);
 
+      final Set<Container> containersToRelease = new HashSet<>();
+
      // Find all joined participants not in-use for this round of inspection.
      // If idle time is beyond tolerance, mark the instance as unused by assigning timestamp as -1.
      for (String participant : allParticipants) {
@@ -299,27 +376,79 @@ void runInternal() {
          // Remove this instance if existed in the tracking map.
          instanceIdleSince.remove(participant);
        }
+
+        if(helixInstancesContainingStuckTasks.contains(participant)) {
+          instanceStuckSince.putIfAbsent(participant, System.currentTimeMillis());
+          if (isInstanceStuck(participant)) {
+            // release the corresponding container as the helix task is stuck for a long time
+            log.info("Instance {} has some helix partition that is stuck for {} minutes, "
+                + "releasing the container enabled : {}", participant,
+                TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - instanceStuckSince.get(participant)),
+                enableReleasingContainerHavingStuckTask);
+
+            // get container of the helix participant
+            Optional<Container> container = yarnService.getContainerInfoGivenHelixParticipant(participant);
+            instanceStuckSince.remove(participant);
+            String containerId = "";
+            if(container.isPresent()) {
+              if (enableReleasingContainerHavingStuckTask) {
+                containersToRelease.add(container.get());
+              }
+              containerId = container.get().getId().toString();
+            } else {
+              log.warn("Container information for participant {} is not found", participant);
+            }
+
+            if(this.yarnService.getEventSubmitter().isPresent()) {
+              // send GTE
+              this.yarnService.getEventSubmitter().get().submit(GobblinYarnEventConstants.EventNames.HELIX_PARTITION_STUCK,
+                  GobblinHelixConstants.HELIX_INSTANCE_NAME_KEY, participant,
+                  GobblinYarnMetricTagNames.CONTAINER_ID, containerId);
+            }
+          }
+        } else {
+          instanceStuckSince.remove(participant);
+        }
+      }
+
+      // release the containers
+      if(!containersToRelease.isEmpty()) {
+        this.yarnService.getEventBus().post(new ContainerReleaseRequest(containersToRelease, true));
       }
+
       slidingWindowReservoir.add(yarnContainerRequestBundle);
 
+
       log.debug("There are {} containers being requested in total, tag-count map {}, tag-resource map {}",
           yarnContainerRequestBundle.getTotalContainers(), yarnContainerRequestBundle.getHelixTagContainerCountMap(),
           yarnContainerRequestBundle.getHelixTagResourceMap());
 
       this.yarnService.requestTargetNumberOfContainers(slidingWindowReservoir.getMax(), inUseInstances);
     }
 
-    @VisibleForTesting
     /**
     * Return true is the condition for tagging an instance as "unused" holds.
     * The condition, by default is that if an instance went back to
     * active (having partition running on it) within {@link #maxIdleTimeInMinutesBeforeScalingDown} minutes, we will
     * not tag that instance as "unused" and have that as the candidate for scaling down.
     */
+    @VisibleForTesting
     boolean isInstanceUnused(String participant){
       return System.currentTimeMillis() - instanceIdleSince.get(participant) >
           TimeUnit.MINUTES.toMillis(maxIdleTimeInMinutesBeforeScalingDown);
     }
+
+    /**
+    * Return true is the condition for tagging an instance as stuck.
+    * The condition, by default is that if a task running on an instance went back to any other state other than given
+    * states within {@link #maxTimeInMinutesBeforeReleasingContainerHavingStuckTask} minutes, we will
+    * not tag that instance as stuck and the container will not be scaled down.
+    */
+    @VisibleForTesting
+    boolean isInstanceStuck(final String participant) {
+      return System.currentTimeMillis() - instanceStuckSince.get(participant) >
+          TimeUnit.MINUTES.toMillis(maxTimeInMinutesBeforeReleasingContainerHavingStuckTask);
+    }
   }
 
   /**
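For reference, the stuck-task bookkeeping added above boils down to a first-seen timestamp per Helix participant plus a threshold comparison, mirroring the existing instanceIdleSince pattern. A standalone sketch of just that logic, not Gobblin code and with placeholder names, follows.

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;

public class StuckSinceTrackerSketch {
  // participant -> time (ms) at which it was first seen with a task in a configured state
  private final Map<String, Long> instanceStuckSince = new HashMap<>();
  private final long thresholdMinutes;

  public StuckSinceTrackerSketch(long thresholdMinutes) {
    this.thresholdMinutes = thresholdMinutes;
  }

  /** Record the participant on first sighting and report whether it has been stuck past the threshold. */
  public boolean observeAndCheck(String participant) {
    instanceStuckSince.putIfAbsent(participant, System.currentTimeMillis());
    return System.currentTimeMillis() - instanceStuckSince.get(participant)
        > TimeUnit.MINUTES.toMillis(thresholdMinutes);
  }

  /** Forget a participant whose tasks are no longer in any configured state. */
  public void clear(String participant) {
    instanceStuckSince.remove(participant);
  }
}

In the actual runnable the entry is also removed once a stuck participant's container has been handed to the release path, so the next polling cycle starts a fresh timer.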

gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnService.java

+25 −7

@@ -142,6 +142,7 @@ public class YarnService extends AbstractIdleService {
   private final FileSystem fs;
 
   private final Optional<GobblinMetrics> gobblinMetrics;
+  @Getter
   private final Optional<EventSubmitter> eventSubmitter;
 
   @VisibleForTesting
@@ -313,6 +314,21 @@ public void handleNewContainerRequest(NewContainerRequest newContainerRequest) {
         newContainerRequest.getResource());
   }
 
+  /**
+   * getContainerInfoGivenHelixParticipant returns the container of the given helixParticipant if it exists else
+   * return Optional<Container>
+   * @param helixParticipant
+   * @return Container
+   */
+  public Optional<Container> getContainerInfoGivenHelixParticipant(final String helixParticipant) {
+    for (ContainerInfo containerInfo : this.containerMap.values()) {
+      if (containerInfo.getHelixParticipantId().equals(helixParticipant)) {
+        return Optional.fromNullable(containerInfo.getContainer());
+      }
+    }
+    return Optional.absent();
+  }
+
   protected NMClientCallbackHandler getNMClientCallbackHandler() {
     return new NMClientCallbackHandler();
   }
@@ -335,13 +351,15 @@ public void handleContainerReleaseRequest(ContainerReleaseRequest containerRelea
     for (Container container : containerReleaseRequest.getContainers()) {
       LOGGER.info(String.format("Releasing container %s running on %s", container.getId(), container.getNodeId()));
 
-      // Record that this container was explicitly released so that a new one is not spawned to replace it
-      // Put the container id in the releasedContainerCache before releasing it so that handleContainerCompletion()
-      // can check for the container id and skip spawning a replacement container.
-      // Note that this is the best effort since these are asynchronous operations and a container may abort concurrently
-      // with the release call. So in some cases a replacement container may have already been spawned before
-      // the container is put into the black list.
-      this.releasedContainerCache.put(container.getId(), "");
+      if(!containerReleaseRequest.isShouldSpinUpReplacementContainers()) {
+        // Record that this container was explicitly released so that a new one is not spawned to replace it
+        // Put the container id in the releasedContainerCache before releasing it so that handleContainerCompletion()
+        // can check for the container id and skip spawning a replacement container.
+        // Note that this is the best effort since these are asynchronous operations and a container may abort concurrently
+        // with the release call. So in some cases a replacement container may have already been spawned before
+        // the container is put into the black list.
+        this.releasedContainerCache.put(container.getId(), "");
+      }
       this.amrmClientAsync.releaseAssignedContainer(container.getId());
     }
   }
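The new lookup returns a Guava Optional rather than null, so callers handle the absent case explicitly, which is what the autoscaler does when a participant has no registered container. A hedged usage sketch, with yarnService and the participant name as placeholders:

import org.apache.hadoop.yarn.api.records.Container;

import com.google.common.base.Optional;

import org.apache.gobblin.yarn.YarnService;

public class ContainerLookupExample {

  /** Resolve a participant to its container id, falling back to an empty string when none is registered. */
  public static String describeContainer(YarnService yarnService, String helixParticipant) {
    Optional<Container> container = yarnService.getContainerInfoGivenHelixParticipant(helixParticipant);
    if (container.isPresent()) {
      return container.get().getId().toString();
    }
    // Mirrors the autoscaler, which logs a warning and keeps an empty container id for the emitted event.
    return "";
  }
}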

gobblin-yarn/src/main/java/org/apache/gobblin/yarn/event/ContainerReleaseRequest.java

+14 −9

@@ -21,26 +21,31 @@
 
 import org.apache.hadoop.yarn.api.records.Container;
 
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+
 
 /**
 * A type of event for container release requests to be used with a {@link com.google.common.eventbus.EventBus}.
 * This event is different than {@link ContainerShutdownRequest} because it releases the container through
 * the Resource Manager, while {@link ContainerShutdownRequest} shuts down a container through the
 * Node Manager
 */
+@Getter
+@AllArgsConstructor
 public class ContainerReleaseRequest {
-  private final Collection<Container> containers;
-
-  public ContainerReleaseRequest(Collection<Container> containers) {
-    this.containers = containers;
-  }
-
   /**
-   * Get the IDs of the containers to release.
+   * -- GETTER --
+   *  Get the IDs of the containers to release.
    *
    * @return the IDs of the containers to release
    */
-  public Collection<Container> getContainers() {
-    return this.containers;
+  private final Collection<Container> containers;
+  // shouldSpinUpReplacementContainers is used to indicate whether to replace the released containers with new ones or not
+  private final boolean shouldSpinUpReplacementContainers;
+
+  public ContainerReleaseRequest(Collection<Container> containers) {
+    this.containers = containers;
+    this.shouldSpinUpReplacementContainers = false;
  }
 }
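With @AllArgsConstructor in place, a caller can choose per request whether YarnService should spin up replacements for the released containers, while the hand-written single-argument constructor keeps the old behaviour of suppressing replacements. A short sketch of both paths; eventBus and the container collections are placeholders, not values from this commit.

import java.util.Collection;

import org.apache.hadoop.yarn.api.records.Container;

import com.google.common.eventbus.EventBus;

import org.apache.gobblin.yarn.event.ContainerReleaseRequest;

public class ContainerReleaseExamples {

  /** Release stuck containers and let YarnService replace them, as the autoscaler now does. */
  public static void releaseAndReplace(EventBus eventBus, Collection<Container> stuckContainers) {
    eventBus.post(new ContainerReleaseRequest(stuckContainers, true));
  }

  /** Release containers without spawning replacements (the pre-existing behaviour). */
  public static void releaseWithoutReplacement(EventBus eventBus, Collection<Container> idleContainers) {
    eventBus.post(new ContainerReleaseRequest(idleContainers));
  }
}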
