Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a7a926e
Enable persistence of the done field to provide protection against mi…
svanoort Mar 16, 2018
7b2ba86
Hardening and logging for abnormal cases with onLoad and incomplete p…
svanoort Mar 16, 2018
c028682
Findbugs fixes
svanoort Mar 16, 2018
046e27a
More logging and done flags
svanoort Mar 17, 2018
d33c90c
Testcases to verify final build state
svanoort Mar 17, 2018
1ad3eb4
Diagnostic logging when setNewHead called with null head
svanoort Mar 19, 2018
cf829bb
Downgrade one warning log to info
svanoort Mar 19, 2018
a017be9
Diagnostics fix
svanoort Mar 19, 2018
5d08569
[JENKINS-50407] Trying to diagnose an NPE.
jglick Mar 26, 2018
4c545d6
Mention issue number in exception for easy tracking.
jglick Mar 26, 2018
1f47c90
Merge branch 'master' into diag-JENKINS-50407
jglick Mar 27, 2018
b34e1f1
Consume latest WF-job snapshot and amend logging
svanoort Mar 30, 2018
ab88c6d
Merge remote-tracking branch 'jglick/diag-JENKINS-50407' into fix-jen…
svanoort Mar 30, 2018
f4aad61
More errorproofing in FlowHead
svanoort Mar 30, 2018
16dcb35
Apply the noted review changes
svanoort Apr 3, 2018
1a279ef
Bump wf-job version to version with more hardening
svanoort Apr 3, 2018
5aa15b5
More robustness enhancements from fuzzing and extend/enhance fuzzing …
svanoort Apr 4, 2018
b91d775
Fix durability mode setting in test
svanoort Apr 5, 2018
e69a24c
Additional robustness improvements, plus a bit more test coverage
svanoort Apr 5, 2018
9d8f0b1
Ensure that Program state always has its FlowNodes fully persisted be…
svanoort Apr 5, 2018
95d7ee2
Add additional testcase Javadocs
svanoort Apr 5, 2018
a9c52d6
Missing ignore for fuzzer test
svanoort Apr 5, 2018
a4110d2
Update test harness dep to latest snapshot
svanoort Apr 5, 2018
773855b
Reduce one fuzz test timing so it cannot time out
svanoort Apr 6, 2018
91212d2
Fix one error logging that missed the exception
svanoort Apr 6, 2018
5ecfd2c
Update to test harness
svanoort Apr 6, 2018
1360644
Consume final workflow-job released version
svanoort Apr 8, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
<workflow-support-plugin.version>2.17</workflow-support-plugin.version>
<scm-api-plugin.version>2.0.8</scm-api-plugin.version>
<groovy-cps.version>1.24</groovy-cps.version>
<jenkins-test-harness.version>2.33</jenkins-test-harness.version>
<jenkins-test-harness.version>2.37</jenkins-test-harness.version>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file a request to update the parent POM

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

</properties>
<dependencies>
<dependency>
Expand Down Expand Up @@ -141,7 +141,7 @@
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-job</artifactId>
<version>2.18-20180329.215807-4</version>
<version>2.18-20180406.172304-8</version>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please link to upstream PRs

<scope>test</scope>
</dependency>
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ public class CpsFlowExecution extends FlowExecution implements BlockableResume {
* {@link FlowExecution} gets loaded into memory for the build records that have been completed,
* and for those we don't want to load the program state, so that check should be efficient.
*/
Boolean done; // Only non-private for unit test use.
boolean done; // Only non-private for unit test use.

/**
* Groovy compiler with CPS+sandbox transformation correctly setup.
Expand Down Expand Up @@ -728,7 +728,7 @@ public void onLoad(FlowExecutionOwner owner) throws IOException {
LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl());
throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down.");
}
} else if (done == Boolean.TRUE && !super.isComplete()) {
} else if (done && !super.isComplete()) {
LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString());
}
} catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expand Down Expand Up @@ -820,7 +820,6 @@ private void loadProgramFailed(final Throwable problem, SettableFuture<CpsThread
} else {
head = getFirstHead();
}
done = Boolean.TRUE;
}

if (head==null) {
Expand Down Expand Up @@ -857,13 +856,12 @@ private void loadProgramFailed(final Throwable problem, SettableFuture<CpsThread
/** Report a fatal error in the VM. */
void croak(Throwable t) {
setResult(Result.FAILURE);
done = Boolean.TRUE;
onProgramEnd(new Outcome(null, t));
cleanUpHeap();
try {
saveOwner();
} catch (Exception ex) {
LOGGER.log(Level.WARNING, "Failed to persist WorkflowRun after noting a serious failure for run:", owner);
LOGGER.log(Level.WARNING, "Failed to persist WorkflowRun after noting a serious failure for run: " + owner, ex);
}
}

Expand Down Expand Up @@ -1193,7 +1191,7 @@ public static void maybeAutoPersistNode(@Nonnull FlowNode node) {
@Override
@SuppressFBWarnings(value = "RC_REF_COMPARISON_BAD_PRACTICE_BOOLEAN", justification = "We want to explicitly check for boolean not-null and true")
public boolean isComplete() {
return done == Boolean.TRUE || super.isComplete(); // Compare to Boolean.TRUE so null == false.
return done || super.isComplete(); // Compare to Boolean.TRUE so null == false.
}

/**
Expand All @@ -1207,18 +1205,24 @@ synchronized void onProgramEnd(Outcome outcome) {
}

// shrink everything into a single new head
done = Boolean.TRUE;
if (heads != null) {
FlowHead first = getFirstHead();
first.setNewHead(head);
heads.clear();
heads.put(first.getId(), first);

String tempIotaStr = Integer.toString(this.iota.get());
FlowHead lastHead = heads.get(first.getId());
if (lastHead == null || lastHead.get() == null || !(lastHead.get().getId().equals(tempIotaStr))) {
LOGGER.log(Level.WARNING, "Invalid final head for execution "+this.owner+" with head: "+lastHead);
try {
if (heads != null) {
FlowHead first = getFirstHead();
first.setNewHead(head);
done = Boolean.TRUE; // After setting the final head
heads.clear();
heads.put(first.getId(), first);

String tempIotaStr = Integer.toString(this.iota.get());
FlowHead lastHead = heads.get(first.getId());
if (lastHead == null || lastHead.get() == null || !(lastHead.get().getId().equals(tempIotaStr))) {
// Warning of problems with the final call to FlowHead.setNewHead
LOGGER.log(Level.WARNING, "Invalid final head for execution "+this.owner+" with head: "+lastHead);
}
}
} catch (Exception ex) {
done = Boolean.TRUE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

true

throw ex;
}

try {
Expand Down Expand Up @@ -1566,9 +1570,7 @@ public void marshal(Object source, HierarchicalStreamWriter w, MarshallingContex
for (BlockStartNode st : e.startNodes) {
writeChild(w, context, "start", st.getId(), String.class);
}
if (e.done != null) {
writeChild(w, context, "done", e.done, Boolean.class);
}
writeChild(w, context, "done", e.done, Boolean.class);
}
writeChild(w, context, "resumeBlocked", e.resumeBlocked, Boolean.class);

Expand Down Expand Up @@ -1818,6 +1820,13 @@ private void checkpoint() {
boolean persistOk = true;
FlowNodeStorage storage = getStorage();
if (storage != null) {
try { // Node storage must be flushed first so program can be restored
storage.flush();
} catch (IOException ioe) {
persistOk=false;
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe);
}

// Try to ensure we've saved the appropriate things -- the program is the last stumbling block.
try {
final SettableFuture<Void> myOutcome = SettableFuture.create();
Expand Down Expand Up @@ -1849,13 +1858,6 @@ public void onFailure(Throwable t) {
persistOk = false;
LOGGER.log(Level.FINE, "Error saving program, that should be handled elsewhere.", ex);
}

try {
storage.flush();
} catch (IOException ioe) {
persistOk=false;
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe);
}
persistedClean = persistOk;
saveOwner();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -534,9 +534,12 @@ synchronized boolean switchToAsyncMode() {
@Override public void onSuccess(CpsThreadGroup result) {
try {
// TODO keep track of whether the program was saved anyway after saveState was called but before now, and do not bother resaving it in that case
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW how does durability interact with saveState? Important.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jglick CpsStepContext#saveState() follows the same rules as overall Program durability - if your DurabilityHint has isPersistWithEveryStep() (i.e. MAX_SURVIVABILITY or SURVIVABLE_NONATOMIC) it saves normally.

Otherwise (for PERFORMANCE_OPTIMIZED) it doesn't -- your program is not providing guaranteed persistence in the face of catastrophic system failures.

Of course with a clean shutdown the pipeline will persist via the @Terminator as part of shutdown, so that's fine.

result.saveProgram();
if (result.getExecution().getDurabilityHint().isPersistWithEveryStep()) {
result.getExecution().getStorage().flush();
result.saveProgram();
}
f.set(null);
} catch (IOException x) {
} catch (Exception x) {
f.setException(x);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import static org.jenkinsci.plugins.workflow.cps.CpsFlowExecution.*;
import static org.jenkinsci.plugins.workflow.cps.persistence.PersistenceContext.*;
import org.jenkinsci.plugins.workflow.pickles.PickleFactory;
import org.jenkinsci.plugins.workflow.support.storage.FlowNodeStorage;

/**
* List of {@link CpsThread}s that form a single {@link CpsFlowExecution}.
Expand Down Expand Up @@ -425,6 +426,16 @@ public CpsThreadDump getThreadDump() {
void saveProgramIfPossible(boolean enteringQuietState) {
if (this.getExecution() != null && (this.getExecution().getDurabilityHint().isPersistWithEveryStep()
|| enteringQuietState)) {

try { // Program may depend on flownodes being saved, so save nodes
FlowNodeStorage storage = this.execution.getStorage();
if (storage != null) {
storage.flush();
}
} catch (IOException ioe) {
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before saving program", ioe);
}

try {
saveProgram();
} catch (IOException x) {
Expand Down
Loading