Skip to content
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5c20d5e
[JENKINS-38381] Calling Controller.watch API.
jglick Feb 9, 2018
b4ff3a5
Merge branch 'UTF-8-JENKINS-31096' into watch-plus-UTF-8-JENKINS-31096
jglick Feb 13, 2018
cb05308
Merge branch 'master' into watch-JENKINS-38381
jglick Jun 8, 2018
13885df
Matching https://github.com/jenkinsci/durable-task-plugin/pull/60/com…
jglick Jun 8, 2018
af31da4
Merge branch 'UTF-8-JENKINS-31096' into watch-plus-UTF-8-JENKINS-31096
jglick Jun 9, 2018
bd27dc6
Merge branch 'watch-JENKINS-38381' into watch-plus-UTF-8-JENKINS-31096
jglick Jun 9, 2018
5eccfbb
Review comments from @oleg-nenashev.
jglick Jun 9, 2018
756a87d
Merge branch 'UTF-8-JENKINS-31096' into watch-plus-UTF-8-JENKINS-31096
jglick Jun 9, 2018
6c424e0
Merge branch 'watch-JENKINS-38381' into watch-plus-UTF-8-JENKINS-31096
jglick Jun 9, 2018
0dcfd8e
Merge branch 'master' into watch-plus-UTF-8-JENKINS-31096
jglick Aug 7, 2018
12667dd
https://github.com/jenkinsci/workflow-job-plugin/pull/89 has already …
jglick Aug 7, 2018
e2536cc
Using a production value for WATCHING_RECURRENCE_PERIOD by default in…
jglick Aug 7, 2018
458d5ee
Added a feature flag for watching mode in case it needs to be disable…
jglick Aug 7, 2018
8440d04
FindBugs
jglick Aug 8, 2018
335d00f
Introduced awaitingAsynchExit flag to handle a possible corner case.
jglick Aug 8, 2018
9a094f0
Process.destroyForcibly should make for more colorful tests!
jglick Aug 8, 2018
38e5a73
Merge branch 'master' into watch-JENKINS-38381
jglick Aug 8, 2018
86d8074
Added ExecutorStepTest.shellOutputAcrossRestart to see what happens t…
jglick Aug 10, 2018
705b254
Merge branch 'jnlpProc' into watch-JENKINS-38381
jglick Aug 10, 2018
6722cdb
Noting a way to reproduce a loss of a small amount of output during r…
jglick Aug 10, 2018
62aeb45
Work around exception suppression from PrintStream.
jglick Aug 10, 2018
ed4551a
Verifying https://github.com/jenkinsci/durable-task-plugin/pull/60/co…
jglick Aug 10, 2018
c41f3ec
Comments.
jglick Aug 10, 2018
c859378
After discussion with @dwnusbaum, attempting to proceed after errors …
jglick Aug 10, 2018
3673025
Richer Javadoc.
jglick Aug 13, 2018
43b6d27
When writing directly to PrintStream.out, synchronize on it like Prin…
jglick Aug 14, 2018
24ee12b
Synchronization should be on the PrintStream, not its underlying Outp…
jglick Aug 14, 2018
3f8ced4
Fail with a more informative exception in case the PrintStream is clo…
jglick Aug 14, 2018
d54b065
Merge branch 'master' into watch-JENKINS-38381
jglick Aug 22, 2018
efb481c
Merge branch 'master' into watch-JENKINS-38381
jglick Sep 4, 2018
77b81c3
Bump.
jglick Sep 4, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
<dependency>
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>durable-task</artifactId>
<version>1.24</version>
<version>1.26-rc362.b077ce3ee7ca</version> <!-- TODO https://github.com/jenkinsci/durable-task-plugin/pull/60 -->
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,21 @@
import hudson.AbortException;
import hudson.EnvVars;
import hudson.FilePath;
import hudson.Functions;
import hudson.Launcher;
import hudson.Util;
import hudson.model.TaskListener;
import hudson.remoting.ChannelClosedException;
import hudson.util.DaemonThreadFactory;
import hudson.util.FormValidation;
import hudson.util.LogTaskListener;
import hudson.util.NamingThreadFactory;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.lang.reflect.Field;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Set;
Expand All @@ -48,8 +55,10 @@
import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import jenkins.util.Timer;
import org.apache.commons.io.IOUtils;
import org.jenkinsci.plugins.durabletask.Controller;
import org.jenkinsci.plugins.durabletask.DurableTask;
import org.jenkinsci.plugins.durabletask.Handler;
import org.jenkinsci.plugins.workflow.FilePathUtils;
import org.jenkinsci.plugins.workflow.steps.AbstractStepExecutionImpl;
import org.jenkinsci.plugins.workflow.steps.Step;
Expand All @@ -60,6 +69,7 @@
import org.jenkinsci.plugins.workflow.support.concurrent.WithThreadName;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.DoNotUse;
import org.kohsuke.accmod.restrictions.NoExternalUse;
import org.kohsuke.stapler.DataBoundSetter;
import org.kohsuke.stapler.QueryParameter;

Expand Down Expand Up @@ -132,11 +142,25 @@ public FormValidation doCheckReturnStatus(@QueryParameter boolean returnStdout,

}

interface ExecutionRemotable {
void exited(int code, byte[] output) throws Exception;
void problem(Exception x);
}

// TODO this and the other constants could be made customizable via system property
@SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "public & mutable only for tests")
@Restricted(NoExternalUse.class)
public static long WATCHING_RECURRENCE_PERIOD = /* 5m */300_000;

/** If set to false, disables {@link Execution#watching} mode. */
@SuppressWarnings("FieldMayBeFinal")
private static boolean USE_WATCHING = !"false".equals(System.getProperty(DurableTaskStep.class.getName() + ".USE_WATCHING"));

/**
* Represents one task that is believed to still be running.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will flesh this out with an explanation of the expected state changes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done)

*/
@SuppressFBWarnings(value="SE_TRANSIENT_FIELD_NOT_RESTORED", justification="recurrencePeriod is set in onResume, not deserialization")
static final class Execution extends AbstractStepExecutionImpl implements Runnable {
static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable {

private static final long MIN_RECURRENCE_PERIOD = 250; // ¼s
private static final long MAX_RECURRENCE_PERIOD = 15000; // 15s
Expand All @@ -150,6 +174,10 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab

private transient final DurableTaskStep step;
private transient FilePath ws;
/**
* How many ms we plan to sleep before running {@link #check} again.
* Zero is used as a signal to break out of the loop.
*/
private transient long recurrencePeriod;
private transient volatile ScheduledFuture<?> task, stopTask;
private transient boolean printedCannotContactMessage;
Expand All @@ -158,6 +186,9 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
private String remote;
private boolean returnStdout; // serialized default is false
private boolean returnStatus; // serialized default is false
private boolean watching; // serialized default is false
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what makes us safe for running builds on upgrade.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. For a sh step running across the upgrade, we behave in traditional polling mode.

/** Only used when {@link #watching}, if after {@link #WATCHING_RECURRENCE_PERIOD} comes around twice {@link #exited} has yet to be called. */
private transient boolean awaitingAsynchExit;

Execution(StepContext context, DurableTaskStep step) {
super(context);
Expand All @@ -174,14 +205,24 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
if (returnStdout) {
durableTask.captureOutput();
}
TaskListener listener = context.get(TaskListener.class);
if (step.encoding != null) {
durableTask.charset(Charset.forName(step.encoding));
} else {
durableTask.defaultCharset();
}
controller = durableTask.launch(context.get(EnvVars.class), ws, context.get(Launcher.class), context.get(TaskListener.class));
controller = durableTask.launch(context.get(EnvVars.class), ws, context.get(Launcher.class), listener);
this.remote = ws.getRemote();
setupTimer();
if (USE_WATCHING) {
try {
controller.watch(ws, new HandlerImpl(this, ws, listener), listener);
watching = true;
} catch (UnsupportedOperationException x) {
LOGGER.log(Level.WARNING, /* default exception message suffices */null, x);
// and we fall back to polling mode
}
}
setupTimer(watching ? WATCHING_RECURRENCE_PERIOD : MIN_RECURRENCE_PERIOD);
return false;
}

Expand All @@ -192,26 +233,44 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
LOGGER.log(Level.FINE, "Jenkins is not running, no such node {0}, or it is offline", node);
return null;
}
if (watching) {
try {
controller.watch(ws, new HandlerImpl(this, ws, listener()), listener());
recurrencePeriod = WATCHING_RECURRENCE_PERIOD;
} catch (UnsupportedOperationException x) {
// Should not happen, since it worked in start() and a given Controller should not have *dropped* support.
getContext().onFailure(x);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be useful to set watching = false and fall back to polling in this case? Is there even any way to get here without modifying the serialized state? I guess if the controller implementation supported watch, and then we changed the implementation and removed the override, then any pipelines that resume after upgrading this plugin would break, but that feels like a pretty contrived case. Maybe worth a comment that this should never happen IIUC.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it should never happen; can make that clearer in a comment.

} catch (Exception x) {
getWorkspaceProblem(x);
return null;
}
}
}
boolean directory;
try (Timeout timeout = Timeout.limit(10, TimeUnit.SECONDS)) {
directory = ws.isDirectory();
} catch (Exception x) {
// RequestAbortedException, ChannelClosedException, EOFException, wrappers thereof; InterruptedException if it just takes too long.
LOGGER.log(Level.FINE, node + " is evidently offline now", x);
ws = null;
if (!printedCannotContactMessage) {
listener().getLogger().println("Cannot contact " + node + ": " + x);
printedCannotContactMessage = true;
}
getWorkspaceProblem(x);
return null;
}
if (!directory) {
throw new AbortException("missing workspace " + remote + " on " + node);
}
LOGGER.log(Level.FINER, "{0} seems to be online so using {1}", new Object[] {node, remote});
return ws;
}

private void getWorkspaceProblem(Exception x) {
// RequestAbortedException, ChannelClosedException, EOFException, wrappers thereof; InterruptedException if it just takes too long.
LOGGER.log(Level.FINE, node + " is evidently offline now", x);
ws = null;
recurrencePeriod = MIN_RECURRENCE_PERIOD;
if (!printedCannotContactMessage) {
listener().getLogger().println("Cannot contact " + node + ": " + x);
printedCannotContactMessage = true;
}
}

private @Nonnull TaskListener listener() {
TaskListener l;
StepContext context = getContext();
Expand Down Expand Up @@ -252,6 +311,14 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
recurrencePeriod = 0;
listener().getLogger().println("After 10s process did not stop");
getContext().onFailure(cause);
try {
FilePath workspace = getWorkspace();
if (workspace != null) {
controller.cleanup(workspace);
}
} catch (IOException | InterruptedException x) {
Functions.printStackTrace(x, listener().getLogger());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🐜 Might be worth mentioning that we failed to do workspace cleanup (so it's clear this is not super serious).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could do that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cancel that—at this point we have already printed an unusual message above, and any exception here probably came from getWorkspace.

}
}
}
}, 10, TimeUnit.SECONDS);
Expand Down Expand Up @@ -315,10 +382,23 @@ private void check() {
return;
}
if (workspace == null) {
recurrencePeriod = Math.min((long) (recurrencePeriod * RECURRENCE_PERIOD_BACKOFF), MAX_RECURRENCE_PERIOD);
return; // slave not yet ready, wait for another day
}
TaskListener listener = listener();
try (Timeout timeout = Timeout.limit(10, TimeUnit.SECONDS)) {
if (watching) {
Integer exitCode = controller.exitStatus(workspace, launcher());
if (exitCode == null) {
LOGGER.log(Level.FINE, "still running in {0} on {1}", new Object[] {remote, node});
} else if (awaitingAsynchExit) {
recurrencePeriod = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe safer to have a slight nonzero delay?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, (exact) zero is the signal to avoid rescheduling.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, so it's a special signal value, normally that would be "-1".

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps. At any rate, zero has been used for this purpose from the start—not introduced in this patch. I can make that explicit in Javadoc.

getContext().onFailure(new AbortException("script apparently exited with code " + exitCode + " but asynchronous notification was lost"));
} else {
LOGGER.log(Level.FINE, "exited with {0} in {1} on {2}; expect asynchronous exit soon", new Object[] {exitCode, remote, node});
awaitingAsynchExit = true;
}
} else { // legacy mode
if (controller.writeLog(workspace, listener.getLogger())) {
getContext().saveState();
recurrencePeriod = MIN_RECURRENCE_PERIOD; // got output, maybe we will get more soon
Expand All @@ -343,6 +423,7 @@ private void check() {
recurrencePeriod = 0;
controller.cleanup(workspace);
}
}
} catch (Exception x) {
LOGGER.log(Level.FINE, "could not check " + workspace, x);
ws = null;
Expand All @@ -353,17 +434,111 @@ private void check() {
}
}

// called remotely from HandlerImpl
@Override public void exited(int exitCode, byte[] output) throws Exception {
try {
getContext().get(TaskListener.class);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we checking for TaskListener in the context in order to identify this condition?

I'll take my answer in the form of a code comment, please.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done)

} catch (IOException | InterruptedException x) {
// E.g., CpsStepContext.doGet complaining that getThreadSynchronously() == null.
// If we cannot even print messages, there is no point proceeding.
LOGGER.log(Level.FINE, "asynchronous exit notification with code " + exitCode + " in " + remote + " on " + node + " ignored since step already seems dead", x);
return;
}
LOGGER.log(Level.FINE, "asynchronous exit notification with code {0} in {1} on {2}", new Object[] {exitCode, remote, node});
if (returnStdout && output == null) {
getContext().onFailure(new IllegalStateException("expected output but got none"));
return;
} else if (!returnStdout && output != null) {
getContext().onFailure(new IllegalStateException("did not expect output but got some"));
return;
}
recurrencePeriod = 0;
if (returnStatus || exitCode == 0) {
getContext().onSuccess(returnStatus ? exitCode : returnStdout ? new String(output, StandardCharsets.UTF_8) : null);
} else {
if (returnStdout) {
listener().getLogger().write(output); // diagnostic
}
getContext().onFailure(new AbortException("script returned exit code " + exitCode));
}
}

// ditto
@Override public void problem(Exception x) {
Functions.printStackTrace(x, listener().getLogger());
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to work. Due to an apparent mistake in jenkinsci/pipeline-cloudwatch-logs-plugin#5 affecting agent-side, but not master-side, logging, I saw in the build log:

java.lang.NullPointerException
	at io.jenkins.plugins.pipeline_log_fluentd_cloudwatch.FluentdLogger$FluentdOutputStream.eol(FluentdLogger.java:128)
	at hudson.console.LineTransformationOutputStream.eol(LineTransformationOutputStream.java:60)
	at hudson.console.LineTransformationOutputStream.write(LineTransformationOutputStream.java:56)
	at hudson.console.LineTransformationOutputStream.write(LineTransformationOutputStream.java:74)
	at java.io.PrintStream.write(PrintStream.java:480)
	at org.jenkinsci.plugins.credentialsbinding.impl.BindingStep$Filter$1.eol(BindingStep.java:183)
	at hudson.console.LineTransformationOutputStream.eol(LineTransformationOutputStream.java:60)
	at hudson.console.LineTransformationOutputStream.write(LineTransformationOutputStream.java:56)
	at hudson.console.LineTransformationOutputStream.write(LineTransformationOutputStream.java:74)
	at org.apache.commons.io.IOUtils.copyLarge(IOUtils.java:1793)
	at org.apache.commons.io.IOUtils.copyLarge(IOUtils.java:1769)
	at org.apache.commons.io.IOUtils.copy(IOUtils.java:1744)
	at org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep$HandlerImpl.output(DurableTaskStep.java:571)
	at org.jenkinsci.plugins.durabletask.FileMonitoringTask$Watcher.run(FileMonitoringTask.java:477)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

with the build proceeding nonetheless.

// note that if there is _also_ a problem in the master-side logger, PrintStream will mask it
}

@Override public void onResume() {
setupTimer();
ws = null; // find it from scratch please
setupTimer(MIN_RECURRENCE_PERIOD);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does not look right when we were watching in the previous session; probably needs to again call Controller.watch (since the agent connection can be assumed to have been broken) and use WATCHING_RECURRENCE_PERIOD. The latter is not readily testable (just timing); the former did not trigger a failure in ExecutorStepTest.buildShellScriptAcrossRestart or buildShellScriptAcrossDisconnect, perhaps because these do not focus much on log output.

jenkinsci/durable-task-plugin#60 (comment) is something different: when we were not watching in the previous session, because the plugin was older. In that case the watching field should deserialize to false and we finish out the step with the old code path. ExecutorStepTest.serialForm exercises this scenario already.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind, I forgot that getWorkspace does both of these things when resumed in watching mode.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jglick If you're the author and couldn't remember that, it's probably worth a code comment so future maintainers (cough) don't make the same assumption and waste time.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually did have a comment, but apparently need to expand it.

// In watch mode, we will quickly enter the check.
// Then in getWorkspace when ws == null we will start a watch and go back to sleep.
}

private void setupTimer() {
recurrencePeriod = MIN_RECURRENCE_PERIOD;
private void setupTimer(long initialRecurrencePeriod) {
recurrencePeriod = initialRecurrencePeriod;
task = THREAD_POOL.schedule(this, recurrencePeriod, TimeUnit.MILLISECONDS);
}

private static final long serialVersionUID = 1L;

}

private static class HandlerImpl extends Handler {

private static final Field printStreamDelegate;
static {
try {
printStreamDelegate = FilterOutputStream.class.getDeclaredField("out");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, reflection hacks: the universal panacea!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately I could not find a better way to do this.

} catch (NoSuchFieldException x) {
// Defined in Java Platform and protected, so should not happen.
throw new ExceptionInInitializerError(x);
}
printStreamDelegate.setAccessible(true);
}

private static final long serialVersionUID = 1L;

private final ExecutionRemotable execution;
private final TaskListener listener;

HandlerImpl(Execution execution, FilePath workspace, TaskListener listener) {
this.execution = workspace.getChannel().export(ExecutionRemotable.class, execution);
this.listener = listener;
}

@Override public void output(InputStream stream) throws Exception {
PrintStream ps = listener.getLogger();
OutputStream os;
if (ps.getClass() == PrintStream.class) {
// Try to extract the underlying stream, since swallowing exceptions is undesirable and PrintStream.checkError is useless.
os = (OutputStream) printStreamDelegate.get(ps);
} else {
// A subclass. Who knows why, but trust any write(…) overrides it may have.
os = ps;
}
try {
IOUtils.copy(stream, os);
Copy link
Member

@dwnusbaum dwnusbaum Aug 13, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like all PrintStream#write overloads synchronize on this and null-check out before writing (since it is nulled out once the stream is closed). Do we need to do either of those here as well? This usage should always be from the same thread IIUC, but do other threads have access to listener.getLogger()?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, that sounds safer.

} catch (ChannelClosedException x) {
// We are giving up on this watch. Wait for some call to getWorkspace to rewatch.
throw x;
} catch (Exception x) {
// Try to report it to the master.
try {
execution.problem(x);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I predict we'll be seeing a lot of this particular report when there are network issues. Thankfully it won't be that often due to the between recurrence periods.

// OK, printed to log on master side, we may have lost some text but could continue.
} catch (Exception x2) { // e.g., RemotingSystemException
// No, channel seems to be broken, give up on this watch.
throw x;
}
}
}

@Override public void exited(int code, byte[] output) throws Exception {
execution.exited(code, output);
}

}

}
Loading