Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a7a926e
Enable persistence of the done field to provide protection against mi…
svanoort Mar 16, 2018
7b2ba86
Hardening and logging for abnormal cases with onLoad and incomplete p…
svanoort Mar 16, 2018
c028682
Findbugs fixes
svanoort Mar 16, 2018
046e27a
More logging and done flags
svanoort Mar 17, 2018
d33c90c
Testcases to verify final build state
svanoort Mar 17, 2018
1ad3eb4
Diagnostic logging when setNewHead called with null head
svanoort Mar 19, 2018
cf829bb
Downgrade one warning log to info
svanoort Mar 19, 2018
a017be9
Diagnostics fix
svanoort Mar 19, 2018
5d08569
[JENKINS-50407] Trying to diagnose an NPE.
jglick Mar 26, 2018
4c545d6
Mention issue number in exception for easy tracking.
jglick Mar 26, 2018
1f47c90
Merge branch 'master' into diag-JENKINS-50407
jglick Mar 27, 2018
b34e1f1
Consume latest WF-job snapshot and amend logging
svanoort Mar 30, 2018
ab88c6d
Merge remote-tracking branch 'jglick/diag-JENKINS-50407' into fix-jen…
svanoort Mar 30, 2018
f4aad61
More errorproofing in FlowHead
svanoort Mar 30, 2018
16dcb35
Apply the noted review changes
svanoort Apr 3, 2018
1a279ef
Bump wf-job version to version with more hardening
svanoort Apr 3, 2018
5aa15b5
More robustness enhancements from fuzzing and extend/enhance fuzzing …
svanoort Apr 4, 2018
b91d775
Fix durability mode setting in test
svanoort Apr 5, 2018
e69a24c
Additional robustness improvements, plus a bit more test coverage
svanoort Apr 5, 2018
9d8f0b1
Ensure that Program state always has its FlowNodes fully persisted be…
svanoort Apr 5, 2018
95d7ee2
Add additional testcase Javadocs
svanoort Apr 5, 2018
a9c52d6
Missing ignore for fuzzer test
svanoort Apr 5, 2018
a4110d2
Update test harness dep to latest snapshot
svanoort Apr 5, 2018
773855b
Reduce one fuzz test timing so it cannot time out
svanoort Apr 6, 2018
91212d2
Fix one error logging that missed the exception
svanoort Apr 6, 2018
5ecfd2c
Update to test harness
svanoort Apr 6, 2018
1360644
Consume final workflow-job released version
svanoort Apr 8, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
<workflow-support-plugin.version>2.17</workflow-support-plugin.version>
<scm-api-plugin.version>2.0.8</scm-api-plugin.version>
<groovy-cps.version>1.24</groovy-cps.version>
<jenkins-test-harness.version>2.33</jenkins-test-harness.version>
<jenkins-test-harness.version>2.37</jenkins-test-harness.version>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file a request to update the parent POM

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

</properties>
<dependencies>
<dependency>
Expand Down Expand Up @@ -141,7 +141,7 @@
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-job</artifactId>
<version>2.17</version>
<version>2.18-20180406.172304-8</version>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please link to upstream PRs

<scope>test</scope>
</dependency>
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import javax.annotation.CheckForNull;
import javax.annotation.Nonnull;
import javax.annotation.concurrent.GuardedBy;
Expand Down Expand Up @@ -282,7 +284,8 @@ public class CpsFlowExecution extends FlowExecution implements BlockableResume {
private transient List<String> startNodesSerial; // used only between unmarshal and onLoad

@GuardedBy("this")
private /* almost final*/ NavigableMap<Integer,FlowHead> heads = new TreeMap<Integer,FlowHead>();
/* almost final*/ NavigableMap<Integer,FlowHead> heads = new TreeMap<Integer,FlowHead>(); // Non-private for unit tests

@SuppressFBWarnings({"IS_FIELD_NOT_GUARDED", "IS2_INCONSISTENT_SYNC"}) // irrelevant here
private transient Map<Integer,String> headsSerial; // used only between unmarshal and onLoad

Expand Down Expand Up @@ -313,7 +316,7 @@ public class CpsFlowExecution extends FlowExecution implements BlockableResume {
* {@link FlowExecution} gets loaded into memory for the build records that have been completed,
* and for those we don't want to load the program state, so that check should be efficient.
*/
private boolean done;
boolean done; // Only non-private for unit test use.

/**
* Groovy compiler with CPS+sandbox transformation correctly setup.
Expand Down Expand Up @@ -598,14 +601,35 @@ int approximateNodeCount() {
return iota.get();
}

/** For diagnostic purposes only, this logs current heads to assist with troubleshooting. */
private synchronized String getHeadsAsString() {
NavigableMap<Integer, FlowHead> myHeads = this.heads;
if (myHeads == null) {
return "null-heads";
} else if (myHeads.size() == 0) {
return "empty-heads";
} else {
return myHeads.entrySet().stream().map(h->h.getKey()+"::"+h.getValue()).collect(Collectors.joining(","));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

myHeads.toString not good enough??

}

}

/** Handle failures where we can't load heads. */
private void rebuildEmptyGraph() {
synchronized (this) {
// something went catastrophically wrong and there's no live head. fake one
LOGGER.log(Level.WARNING, "Failed to load pipeline heads, so faking some up for execution " + this.toString());
if (this.startNodes == null) {
this.startNodes = new Stack<BlockStartNode>();
}
this.heads.clear();

if (this.heads != null && this.heads.size() > 0) {
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.log(Level.INFO, "Resetting heads to rebuild the Pipeline structure, tossing existing heads: "+getHeadsAsString());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oooo, this is a nice addition.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here would reduce to just

if (heads != null && !heads.isEmpty()) {
    LOGGER.log(Level.INFO, "Resetting heads to rebuild the Pipeline structure, tossing existing heads: {0}", heads);
    heads.clear();
}

}
this.heads.clear();
}

this.startNodes.clear();
FlowHead head = new FlowHead(this);
heads.put(head.getId(), head);
Expand Down Expand Up @@ -637,6 +661,7 @@ protected synchronized void initializeStorage() throws IOException {
h.setForDeserialize(storage.getNode(entry.getValue()));
heads.put(h.getId(), h);
} else {
LOGGER.log(Level.WARNING, "Tried to load head FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for head id:FlowNodeId "+entry.getKey()+":"+entry.getValue());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or just:

LOGGER.log(Level.WARNING, "Tried to load head FlowNodes for execution {0} but FlowNode was not found in storage for head id:FlowNodeId {1}", new Object[] {owner, entry});

storageErrors = true;
break;
}
Expand All @@ -654,6 +679,7 @@ protected synchronized void initializeStorage() throws IOException {
startNodes.add((BlockStartNode) storage.getNode(id));
} else {
// TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history
LOGGER.log(Level.WARNING, "Tried to load startNode FlowNodes for execution "+this.owner+" but FlowNode was not found in storage for FlowNode Id "+id);
storageErrors = true;
break;
}
Expand Down Expand Up @@ -687,6 +713,7 @@ public boolean canResume() {
}

@Override
@SuppressFBWarnings(value = "RC_REF_COMPARISON_BAD_PRACTICE_BOOLEAN", justification = "We want to explicitly check for boolean not-null and true")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it wrong that I love that with Boolean, you effectively can treat it as a, I dunno, troolean? What'd be the term for a boolean but for three possible states... =)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A trilean, acc. to Wikipedia.

public void onLoad(FlowExecutionOwner owner) throws IOException {
this.owner = owner;
try {
Expand All @@ -696,13 +723,15 @@ public void onLoad(FlowExecutionOwner owner) throws IOException {
if (canResume()) {
loadProgramAsync(getProgramDataFile());
} else {
// TODO if possible, consider tyring to close out unterminated blocks to keep existing graph history
// TODO if possible, consider trying to close out unterminated blocks to keep existing graph history
// That way we can visualize the graph in some error cases.
LOGGER.log(Level.WARNING, "Pipeline state not properly persisted, cannot resume "+owner.getUrl());
throw new IOException("Cannot resume build -- was not cleanly saved when Jenkins shut down.");
}
} else if (done && !super.isComplete()) {
LOGGER.log(Level.WARNING, "Completed flow without FlowEndNode: "+this+" heads:"+getHeadsAsString());
}
} catch (IOException e) {
} catch (Exception e) { // Multicatch ensures that failure to load does not nuke the master
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SettableFuture<CpsThreadGroup> p = SettableFuture.create();
programPromise = p;
loadProgramFailed(e, p);
Expand Down Expand Up @@ -829,6 +858,11 @@ void croak(Throwable t) {
setResult(Result.FAILURE);
onProgramEnd(new Outcome(null, t));
cleanUpHeap();
try {
saveOwner();
} catch (Exception ex) {
LOGGER.log(Level.WARNING, "Failed to persist WorkflowRun after noting a serious failure for run: " + owner, ex);
}
}

/**
Expand Down Expand Up @@ -1155,8 +1189,9 @@ public static void maybeAutoPersistNode(@Nonnull FlowNode node) {
}

@Override
@SuppressFBWarnings(value = "RC_REF_COMPARISON_BAD_PRACTICE_BOOLEAN", justification = "We want to explicitly check for boolean not-null and true")
public boolean isComplete() {
return done || super.isComplete();
return done || super.isComplete(); // Compare to Boolean.TRUE so null == false.
}

/**
Expand All @@ -1170,20 +1205,31 @@ synchronized void onProgramEnd(Outcome outcome) {
}

// shrink everything into a single new head
done = true;
if (heads != null) {
FlowHead first = getFirstHead();
first.setNewHead(head);
heads.clear();
heads.put(first.getId(),first);
try {
if (heads != null) {
FlowHead first = getFirstHead();
first.setNewHead(head);
done = Boolean.TRUE; // After setting the final head
heads.clear();
heads.put(first.getId(), first);

String tempIotaStr = Integer.toString(this.iota.get());
FlowHead lastHead = heads.get(first.getId());
if (lastHead == null || lastHead.get() == null || !(lastHead.get().getId().equals(tempIotaStr))) {
// Warning of problems with the final call to FlowHead.setNewHead
LOGGER.log(Level.WARNING, "Invalid final head for execution "+this.owner+" with head: "+lastHead);
}
}
} catch (Exception ex) {
done = Boolean.TRUE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

true

throw ex;
}

try {
this.getStorage().flush();
} catch (IOException ioe) {
LOGGER.log(Level.WARNING, "Error flushing FlowNodeStorage to disk at end of run", ioe);
}

}

void cleanUpHeap() {
Expand Down Expand Up @@ -1431,6 +1477,7 @@ public void pause(final boolean v) throws IOException {
if (executable instanceof AccessControlled) {
((AccessControlled) executable).checkPermission(Item.CANCEL);
}
done = Boolean.FALSE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto, could autobox

Futures.addCallback(programPromise, new FutureCallback<CpsThreadGroup>() {
@Override public void onSuccess(CpsThreadGroup g) {
if (v) {
Expand Down Expand Up @@ -1523,6 +1570,7 @@ public void marshal(Object source, HierarchicalStreamWriter w, MarshallingContex
for (BlockStartNode st : e.startNodes) {
writeChild(w, context, "start", st.getId(), String.class);
}
writeChild(w, context, "done", e.done, Boolean.class);
}
writeChild(w, context, "resumeBlocked", e.resumeBlocked, Boolean.class);

Expand Down Expand Up @@ -1587,6 +1635,9 @@ public Object unmarshal(HierarchicalStreamReader reader, final UnmarshallingCont
} else if (nodeName.equals("iota")) {
Integer iota = readChild(reader, context, Integer.class, result);
setField(result, "iota", new AtomicInteger(iota));
} else if (nodeName.equals("done")) {
Boolean isDone = readChild(reader, context, Boolean.class, result);
setField(result, "done", isDone);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you are doing something with historical builds which omit this field, right? Is there some @LocalData test which fails if you comment out that “something”?

Copy link
Member Author

@svanoort svanoort Apr 5, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Historical builds have this field. Also I've now dropped the use of Boolean (wasn't needed, can use primitive).

} else if (nodeName.equals("start")) {
String id = readChild(reader, context, String.class, result);
result.startNodesSerial.add(id);
Expand Down Expand Up @@ -1750,7 +1801,7 @@ public void autopersist(@Nonnull FlowNode n) throws IOException {
/** Save the owner that holds this execution. */
void saveOwner() {
try {
if (this.owner.getExecutable() instanceof Saveable) {
if (this.owner != null && this.owner.getExecutable() instanceof Saveable) { // Null-check covers some anomalous cases we've seen
Saveable saveable = (Saveable)(this.owner.getExecutable());
saveable.save();
}
Expand All @@ -1769,6 +1820,13 @@ private void checkpoint() {
boolean persistOk = true;
FlowNodeStorage storage = getStorage();
if (storage != null) {
try { // Node storage must be flushed first so program can be restored
storage.flush();
} catch (IOException ioe) {
persistOk=false;
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe);
}

// Try to ensure we've saved the appropriate things -- the program is the last stumbling block.
try {
final SettableFuture<Void> myOutcome = SettableFuture.create();
Expand Down Expand Up @@ -1800,13 +1858,6 @@ public void onFailure(Throwable t) {
persistOk = false;
LOGGER.log(Level.FINE, "Error saving program, that should be handled elsewhere.", ex);
}

try {
storage.flush();
} catch (IOException ioe) {
persistOk=false;
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before shutdown", ioe);
}
persistedClean = persistOk;
saveOwner();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -534,9 +534,12 @@ synchronized boolean switchToAsyncMode() {
@Override public void onSuccess(CpsThreadGroup result) {
try {
// TODO keep track of whether the program was saved anyway after saveState was called but before now, and do not bother resaving it in that case
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW how does durability interact with saveState? Important.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jglick CpsStepContext#saveState() follows the same rules as overall Program durability - if your DurabilityHint has isPersistWithEveryStep() (i.e. MAX_SURVIVABILITY or SURVIVABLE_NONATOMIC) it saves normally.

Otherwise (for PERFORMANCE_OPTIMIZED) it doesn't -- your program is not providing guaranteed persistence in the face of catastrophic system failures.

Of course with a clean shutdown the pipeline will persist via the @Terminator as part of shutdown, so that's fine.

result.saveProgram();
if (result.getExecution().getDurabilityHint().isPersistWithEveryStep()) {
result.getExecution().getStorage().flush();
result.saveProgram();
}
f.set(null);
} catch (IOException x) {
} catch (Exception x) {
f.setException(x);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import static org.jenkinsci.plugins.workflow.cps.CpsFlowExecution.*;
import static org.jenkinsci.plugins.workflow.cps.persistence.PersistenceContext.*;
import org.jenkinsci.plugins.workflow.pickles.PickleFactory;
import org.jenkinsci.plugins.workflow.support.storage.FlowNodeStorage;

/**
* List of {@link CpsThread}s that form a single {@link CpsFlowExecution}.
Expand Down Expand Up @@ -357,7 +358,6 @@ private boolean run() {
} else {
stillRunnable |= t.isRunnable();
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

changed = true;
}
}
Expand Down Expand Up @@ -426,6 +426,16 @@ public CpsThreadDump getThreadDump() {
void saveProgramIfPossible(boolean enteringQuietState) {
if (this.getExecution() != null && (this.getExecution().getDurabilityHint().isPersistWithEveryStep()
|| enteringQuietState)) {

try { // Program may depend on flownodes being saved, so save nodes
FlowNodeStorage storage = this.execution.getStorage();
if (storage != null) {
storage.flush();
}
} catch (IOException ioe) {
LOGGER.log(Level.WARNING, "Error persisting FlowNode storage before saving program", ioe);
}

try {
saveProgram();
} catch (IOException x) {
Expand Down
39 changes: 25 additions & 14 deletions src/main/java/org/jenkinsci/plugins/workflow/cps/FlowHead.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.graph.FlowStartNode;

import javax.annotation.Nonnull;

/**
* Growing tip of the node graph.
*
Expand Down Expand Up @@ -114,29 +116,33 @@ void newStartNode(FlowStartNode n) throws IOException {
execution.storage.storeNode(head, false);
}

void setNewHead(FlowNode v) {
void setNewHead(@Nonnull FlowNode v) {
if (v == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So my brain briefly melts at the combination of @Nonnull and if (v == null) - I'm assuming this is defense-in-depth, with the @Nonnull covering compile-time scenarios and the if (v == null) covering even edgier-case scenarios?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's protecting against some things that shouldn't happen but yet were still happening in the wild, even though Findbugs didn't find issues at compile-time (probably it wasn't digging deeply enough).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I have no problem with that approach - probably worth adding a comment explaining that, though. It'd be awfully easy for a later maintainer (i.e., me, because I'm forgetful) to not know/remember that and just think "Oh, that's a pointless null check, lemme clear the dead code".

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh, this is package-private so we can trivially enumerate all the call sites and put in assertions there that the value is non-null unless that is trivially seen from code inspection, right?

// Because Findbugs isn't 100% at catching cases where this can happen and we really need to fail hard-and-fast
throw new IllegalArgumentException("FlowHead.setNewHead called on FlowHead id="+this.id+" with a null FlowNode, execution="+this.execution);
}
try {
if (this.head != null) {
CpsFlowExecution.maybeAutoPersistNode(head);
}
execution.storage.storeNode(v, true);
v.addAction(new TimingAction());
this.head = v;
CpsThreadGroup c = CpsThreadGroup.current();
if (c !=null) {
// if the manipulation is from within the program executing thread, then
// defer the notification till we get to a safe point.
c.notifyNewHead(v);
} else {
// in recovering from error and such situation, we sometimes need to grow the graph
// without running the program.
// TODO can CpsThreadGroup.notifyNewHead be used instead to notify both kinds of listeners?
execution.notifyListeners(Collections.singletonList(v), true);
execution.notifyListeners(Collections.singletonList(v), false);
}
} catch (IOException e) {
LOGGER.log(Level.FINE, "Failed to record new head: " + v, e);
}
this.head = v;
CpsThreadGroup c = CpsThreadGroup.current();
if (c !=null) {
// if the manipulation is from within the program executing thread, then
// defer the notification till we get to a safe point.
c.notifyNewHead(v);
} else {
// in recovering from error and such situation, we sometimes need to grow the graph
// without running the program.
// TODO can CpsThreadGroup.notifyNewHead be used instead to notify both kinds of listeners?
execution.notifyListeners(Collections.singletonList(v), true);
execution.notifyListeners(Collections.singletonList(v), false);
}
}

FlowNode get() {
Expand Down Expand Up @@ -181,4 +187,9 @@ private Object readResolve() {

private static final Logger LOGGER = Logger.getLogger(FlowHead.class.getName());
private static final long serialVersionUID = 1L;

@Override
public String toString() {
return id+":"+head;
}
}
Loading