Skip to content

Add partial result to AggregateCursor continuation #3254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 55 commits into from
Jun 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
6c1ad10
save
pengpeng-lu Feb 24, 2025
855ae27
sync
pengpeng-lu Feb 26, 2025
5c2da3f
save
pengpeng-lu Feb 26, 2025
c62bffa
store null state
pengpeng-lu Feb 27, 2025
06d8e65
work except exhausted
pengpeng-lu Mar 3, 2025
0950385
save
pengpeng-lu Mar 10, 2025
b275ea6
pmd green
pengpeng-lu Mar 10, 2025
c953df8
add test for avg and bitmap
pengpeng-lu Mar 11, 2025
df09e54
save test
pengpeng-lu Mar 13, 2025
195b68c
green
pengpeng-lu Mar 17, 2025
97a3cdc
clean up
pengpeng-lu Mar 17, 2025
4810d22
remove dent
pengpeng-lu Mar 17, 2025
2948826
merge main
pengpeng-lu Mar 17, 2025
0d2d050
revert format change
pengpeng-lu Mar 17, 2025
9c4c50f
checkstyle
pengpeng-lu Mar 18, 2025
2eade46
save new test
pengpeng-lu Mar 18, 2025
e76fe89
intermediate
pengpeng-lu Mar 19, 2025
9540c23
add new plan proto and mode
pengpeng-lu Mar 24, 2025
89cdd6c
continuation mode
pengpeng-lu Mar 25, 2025
413d480
save
pengpeng-lu Mar 31, 2025
047de80
merge main
pengpeng-lu Mar 31, 2025
bb1bd6a
save, union test fail wrong result
pengpeng-lu Apr 6, 2025
255f19b
green
pengpeng-lu Apr 7, 2025
eaa0d78
all work
pengpeng-lu Apr 7, 2025
a1422b0
revert unnecessary changes
pengpeng-lu Apr 7, 2025
7e3187a
merge main
pengpeng-lu Apr 24, 2025
3de4fcc
fix test
pengpeng-lu Apr 25, 2025
b88775b
fix
pengpeng-lu Apr 25, 2025
8b9703c
change state to oneof type
pengpeng-lu Apr 25, 2025
14d157f
refactor
pengpeng-lu Apr 26, 2025
7abfe4b
more refactor
pengpeng-lu Apr 26, 2025
31588aa
remove isResultOnEmpty
pengpeng-lu Apr 26, 2025
6148359
refactor continuation 1
pengpeng-lu Apr 27, 2025
fe6dd12
refactor AggregateCursorContinuation
pengpeng-lu Apr 27, 2025
6c2ee73
save
pengpeng-lu Apr 28, 2025
2e59b19
save
pengpeng-lu Apr 29, 2025
830158c
remove lastResult
pengpeng-lu Apr 30, 2025
c91bc64
save
pengpeng-lu Apr 30, 2025
cd63dc5
refactor
pengpeng-lu Apr 30, 2025
5e05cdf
more test
pengpeng-lu May 5, 2025
39d92a0
more test
pengpeng-lu May 5, 2025
1add4ca
style
pengpeng-lu May 6, 2025
57bf49d
to_old
pengpeng-lu May 6, 2025
cfd3b88
style
pengpeng-lu May 6, 2025
e875d82
save
pengpeng-lu May 23, 2025
4cda3e8
add filterCursor
pengpeng-lu May 23, 2025
93ea4ec
add tests
pengpeng-lu May 27, 2025
e98d3ba
comments
pengpeng-lu May 27, 2025
f811e35
Merge branch 'main' into inner_cont
pengpeng-lu May 27, 2025
62d2e14
fix compile
pengpeng-lu May 27, 2025
d9a1798
spotbug
pengpeng-lu May 28, 2025
bb4525e
trigger build
pengpeng-lu May 28, 2025
6004807
add test to_old
pengpeng-lu Jun 1, 2025
be6887c
comments
pengpeng-lu Jun 2, 2025
f39e470
comments
pengpeng-lu Jun 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,19 @@

import com.apple.foundationdb.annotation.API;
import com.apple.foundationdb.async.AsyncUtil;
import com.apple.foundationdb.record.ByteArrayContinuation;
import com.apple.foundationdb.record.RecordCoreException;
import com.apple.foundationdb.record.RecordCursor;
import com.apple.foundationdb.record.RecordCursorContinuation;
import com.apple.foundationdb.record.RecordCursorProto;
import com.apple.foundationdb.record.RecordCursorResult;
import com.apple.foundationdb.record.RecordCursorVisitor;
import com.apple.foundationdb.record.query.plan.plans.QueryResult;
import com.apple.foundationdb.record.query.plan.plans.RecordQueryStreamingAggregationPlan;
import com.apple.foundationdb.tuple.ByteArrayUtil2;
import com.google.common.base.Verify;
import com.google.protobuf.ByteString;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;

import javax.annotation.Nonnull;
Expand Down Expand Up @@ -55,26 +62,37 @@ public class AggregateCursor<M extends Message> implements RecordCursor<QueryRes
// Previous non-empty record processed by this cursor
@Nullable
private RecordCursorResult<QueryResult> previousValidResult;
@Nonnull
private RecordCursorContinuation previousContinuationInGroup;
@Nullable
private RecordCursorProto.PartialAggregationResult partialAggregationResult;
@Nonnull
private final RecordQueryStreamingAggregationPlan.SerializationMode serializationMode;

public AggregateCursor(@Nonnull RecordCursor<QueryResult> inner,
@Nonnull final StreamGrouping<M> streamGrouping) {
@Nonnull final StreamGrouping<M> streamGrouping,
@Nonnull RecordCursorContinuation continuation,
@Nonnull final RecordQueryStreamingAggregationPlan.SerializationMode serializationMode) {
this.inner = inner;
this.streamGrouping = streamGrouping;
this.serializationMode = serializationMode;
this.previousContinuationInGroup = continuation;
}

@Nonnull
@Override
public CompletableFuture<RecordCursorResult<QueryResult>> onNext() {
if (previousResult != null && !previousResult.hasNext()) {
// we are done
return CompletableFuture.completedFuture(RecordCursorResult.exhausted());
return CompletableFuture.completedFuture(RecordCursorResult.withoutNextValue(new AggregateCursorContinuation(previousResult.getContinuation(), streamGrouping.getPartialAggregationResult(), serializationMode),
previousResult.getNoNextReason()));
}

return AsyncUtil.whileTrue(() -> inner.onNext().thenApply(innerResult -> {
previousResult = innerResult;
if (!innerResult.hasNext()) {
if (!isNoRecords()) {
streamGrouping.finalizeGroup();
partialAggregationResult = streamGrouping.finalizeGroup();
}
return false;
} else {
Expand All @@ -83,45 +101,73 @@ public CompletableFuture<RecordCursorResult<QueryResult>> onNext() {
if (!groupBreak) {
// previousValidResult is the last row before group break, it sets the continuation
previousValidResult = innerResult;
previousContinuationInGroup = previousValidResult.getContinuation();
}
return (!groupBreak);
}
}), getExecutor()).thenApply(vignore -> {
if (isNoRecords()) {
// Edge case where there are no records at all
return RecordCursorResult.exhausted();
// either innerResult.hasNext() = false; or groupBreak = true
if (Verify.verifyNotNull(previousResult).hasNext()) {
// in this case groupBreak = true, return aggregated result and continuation, partialAggregationResult = null
// previousValidResult = null happens when 1st row of current scan != last row of last scan, results in groupBreak = true and previousValidResult = null
RecordCursorContinuation c = new AggregateCursorContinuation(previousContinuationInGroup, serializationMode);

/*
* Update the previousValidResult to the next continuation even though it hasn't been returned. This is to return the correct continuation when there are single-element groups.
* Below is an example that shows how continuation(previousValidResult) moves:
* Initial: previousResult = null, previousValidResult = null
row0 groupKey0 groupBreak = False previousValidResult = row0 previousResult = row0
row1 groupKey0 groupBreak = False previousValidResult = row1 previousResult = row1
row2 groupKey1 groupBreak = True previousValidResult = row1 previousResult = row2
* returns result (groupKey0, continuation = row1), and set previousValidResult = row2
*
* Now there are 2 scenarios, 1) the current iteration continues; 2) the current iteration stops
* In scenario 1, the iteration continues, it gets to row3:
row3 groupKey2 groupBreak = True previousValidResult = row2 previousResult = row3
* returns result (groupKey1, continuation = row2), and set previousValidResult = row3
*
* In scenario 2, a new iteration starts from row2 (because the last returned continuation = row1), and set initial previousResult = null, previousValidResult = null:
row2 groupKey1 groupBreak = False previousValidResult = row2 previousResult = row2
* (Note that because a new iteration starts, groupBreak = False for row2.)
row3 groupKey2 groupBreak = True previousValidResult = row2 previousResult = row3
* returns result (groupKey1, continuation = row2), and set previousValidResult = row3
*
* Both scenarios returns the correct result, and continuation are both set to row3 in the end, row2 is scanned twice if a new iteration starts.
*/
previousValidResult = previousResult;
previousContinuationInGroup = Verify.verifyNotNull(previousValidResult).getContinuation();
return RecordCursorResult.withNextValue(QueryResult.ofComputed(streamGrouping.getCompletedGroupResult()), c);
} else {
// innerResult.hasNext() = false
if (Verify.verifyNotNull(previousResult).getNoNextReason() == NoNextReason.SOURCE_EXHAUSTED) {
// exhausted
if (previousValidResult == null && partialAggregationResult == null) {
return RecordCursorResult.exhausted();
} else {
RecordCursorContinuation c = new AggregateCursorContinuation(previousContinuationInGroup, serializationMode);
previousValidResult = previousResult;
previousContinuationInGroup = Verify.verifyNotNull(previousValidResult).getContinuation();
return RecordCursorResult.withNextValue(QueryResult.ofComputed(streamGrouping.getCompletedGroupResult()), c);
}
} else {
// stopped in the middle of a group
RecordCursorContinuation currentContinuation = new AggregateCursorContinuation(Verify.verifyNotNull(previousResult).getContinuation(), partialAggregationResult, serializationMode);
previousValidResult = previousResult;
previousContinuationInGroup = Verify.verifyNotNull(previousValidResult).getContinuation();
if (serializationMode == RecordQueryStreamingAggregationPlan.SerializationMode.TO_NEW) {
return RecordCursorResult.withoutNextValue(currentContinuation, Verify.verifyNotNull(previousResult).getNoNextReason());
} else {
// for TO_OLD, return {groupKey: partialResult from row x to y}, next time will return {groupKey: partialResult from y+1 to next stop point}
return RecordCursorResult.withNextValue(QueryResult.ofComputed(streamGrouping.getCompletedGroupResult()), currentContinuation);
}
}
}
// Use the last valid result for the continuation as we need non-terminal one here.
RecordCursorContinuation continuation = Verify.verifyNotNull(previousValidResult).getContinuation();
/*
* Update the previousValidResult to the next continuation even though it hasn't been returned. This is to return the correct continuation when there are single-element groups.
* Below is an example that shows how continuation(previousValidResult) moves:
* Initial: previousResult = null, previousValidResult = null
row0 groupKey0 groupBreak = False previousValidResult = row0 previousResult = row0
row1 groupKey0 groupBreak = False previousValidResult = row1 previousResult = row1
row2 groupKey1 groupBreak = True previousValidResult = row1 previousResult = row2
* returns result (groupKey0, continuation = row1), and set previousValidResult = row2
*
* Now there are 2 scenarios, 1) the current iteration continues; 2) the current iteration stops
* In scenario 1, the iteration continues, it gets to row3:
row3 groupKey2 groupBreak = True previousValidResult = row2 previousResult = row3
* returns result (groupKey1, continuation = row2), and set previousValidResult = row3
*
* In scenario 2, a new iteration starts from row2 (because the last returned continuation = row1), and set initial previousResult = null, previousValidResult = null:
row2 groupKey1 groupBreak = False previousValidResult = row2 previousResult = row2
* (Note that because a new iteration starts, groupBreak = False for row2.)
row3 groupKey2 groupBreak = True previousValidResult = row2 previousResult = row3
* returns result (groupKey1, continuation = row2), and set previousValidResult = row3
*
* Both scenarios returns the correct result, and continuation are both set to row3 in the end, row2 is scanned twice if a new iteration starts.
*/
previousValidResult = previousResult;
return RecordCursorResult.withNextValue(QueryResult.ofComputed(streamGrouping.getCompletedGroupResult()), continuation);
});
}



private boolean isNoRecords() {
return ((previousValidResult == null) && (!Verify.verifyNotNull(previousResult).hasNext()));
return ((previousValidResult == null) && (!Verify.verifyNotNull(previousResult).hasNext()) && (streamGrouping.getPartialAggregationResult() == null));
}

@Override
Expand All @@ -147,4 +193,84 @@ public boolean accept(@Nonnull RecordCursorVisitor visitor) {
}
return visitor.visitLeave(this);
}

public static class AggregateCursorContinuation implements RecordCursorContinuation {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment generally about the approach: this isn't quite the way that I thought this was going to go. My assumption had been that we were going to modify the continuation here so that we always return an AggregateCursorContinuation where the innerContinuation is the last continuation read from the inner, and the partialAggregationResult is pretty much always set--for results with a value, it would correspond to the partial result from reading the first value in the next group (to avoid errors like we saw with #3172). The idea there is that the continuation then more directly represents the state of the cursor, and I think there's ultimately less book-keeping (as the continuation can be derived purely from the continuation of the last result of the cursor and from the accumulator state). It also has the benefit that we'd only ever read each key once, rather than needing to re-read the first key of every group when resuming from the continuation of a result with a value.

I do think the alternative presented here (that is, the inner continuation is associated with the last value in a group that was returned and partialAggregationResult is null, unless execution stops in the middle of a group) should work. It's even forward compatible with an approach that did what I described above, and it may simplify the transition process given that we don't need to adjust the book-keeping so that it does different stuff across different versions. That is, with one exception: in the approach I outlined above, we need to be able to handle an innerContinuation which is an end continuation but that is wrapped by a non-end AggregateCursorContinuation. This is what we'd expect the continuation to be once we've exhausted the inner cursor, and we'd need to make sure that the plan constructs an exhausted cursor on that input.

It would be nice if we were a little forward looking and could handle a future continuation of that format. I could also see us wanting to transition to that approach in this PR now is I suppose debatable.

@Nonnull
private final RecordCursorContinuation innerContinuation;

@Nullable
private final RecordCursorProto.PartialAggregationResult partialAggregationResult;

@Nullable
private RecordCursorProto.AggregateCursorContinuation cachedProto;

private final RecordQueryStreamingAggregationPlan.SerializationMode serializationMode;

public AggregateCursorContinuation(@Nonnull RecordCursorContinuation innerContinuation, @Nullable RecordCursorProto.PartialAggregationResult partialAggregationResult, final RecordQueryStreamingAggregationPlan.SerializationMode serializationMode) {
this.innerContinuation = innerContinuation;
this.partialAggregationResult = partialAggregationResult;
this.serializationMode = serializationMode;
}

public AggregateCursorContinuation(@Nonnull RecordCursorContinuation other, RecordQueryStreamingAggregationPlan.SerializationMode serializationMode) {
this(other, null, serializationMode);
}

@Nonnull
@Override
public ByteString toByteString() {
if (serializationMode == RecordQueryStreamingAggregationPlan.SerializationMode.TO_OLD) {
return innerContinuation.toByteString();
} else {
return isEnd() ? ByteString.EMPTY : toProto().toByteString();
}
}

@Nullable
@Override
public byte[] toBytes() {
ByteString byteString = toByteString();
return byteString.isEmpty() ? null : byteString.toByteArray();
}

@Override
public boolean isEnd() {
return innerContinuation.isEnd();
}

@Nullable
public byte[] getInnerContinuation() {
return innerContinuation.toBytes();
}

@Nullable
public RecordCursorProto.PartialAggregationResult getPartialAggregationResult() {
return partialAggregationResult;
}

@Nonnull
private RecordCursorProto.AggregateCursorContinuation toProto() {
if (cachedProto == null) {
RecordCursorProto.AggregateCursorContinuation.Builder cachedProtoBuilder = RecordCursorProto.AggregateCursorContinuation.newBuilder().setContinuation(innerContinuation.toByteString());
if (partialAggregationResult != null) {
cachedProtoBuilder.setPartialAggregationResults(partialAggregationResult);
}
cachedProto = cachedProtoBuilder.build();
}
return cachedProto;
}

public static AggregateCursorContinuation fromRawBytes(@Nonnull byte[] rawBytes, RecordQueryStreamingAggregationPlan.SerializationMode serializationMode) {
if (serializationMode == RecordQueryStreamingAggregationPlan.SerializationMode.TO_OLD) {
return new AggregateCursorContinuation(ByteArrayContinuation.fromNullable(rawBytes), serializationMode);
}
try {
RecordCursorProto.AggregateCursorContinuation continuationProto = RecordCursorProto.AggregateCursorContinuation.parseFrom(rawBytes);
return new AggregateCursorContinuation(ByteArrayContinuation.fromNullable(continuationProto.getContinuation().toByteArray()), continuationProto.hasPartialAggregationResults() ? continuationProto.getPartialAggregationResults() : null, serializationMode);
} catch (InvalidProtocolBufferException ipbe) {
throw new RecordCoreException("error parsing continuation", ipbe)
.addLogInfo("raw_bytes", ByteArrayUtil2.loggable(rawBytes));
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,20 @@

import com.apple.foundationdb.record.Bindings;
import com.apple.foundationdb.record.EvaluationContext;
import com.apple.foundationdb.record.RecordCursorProto;
import com.apple.foundationdb.record.RecordCursorResult;
import com.apple.foundationdb.record.provider.foundationdb.FDBRecordStoreBase;
import com.apple.foundationdb.record.query.plan.cascades.CorrelationIdentifier;
import com.apple.foundationdb.record.query.plan.cascades.values.Accumulator;
import com.apple.foundationdb.record.query.plan.cascades.values.AggregateValue;
import com.apple.foundationdb.record.query.plan.cascades.values.Value;
import com.google.protobuf.DynamicMessage;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.List;
import java.util.Objects;

/**
Expand Down Expand Up @@ -104,10 +108,20 @@ public StreamGrouping(@Nullable final Value groupingKeyValue,
@Nonnull final CorrelationIdentifier aggregateAlias,
@Nonnull final FDBRecordStoreBase<M> store,
@Nonnull final EvaluationContext context,
@Nonnull final CorrelationIdentifier alias) {
@Nonnull final CorrelationIdentifier alias,
@Nullable final RecordCursorProto.PartialAggregationResult partialAggregationResult) {
this.groupingKeyValue = groupingKeyValue;
this.aggregateValue = aggregateValue;
this.accumulator = aggregateValue.createAccumulator(context.getTypeRepository());
if (partialAggregationResult == null) {
this.accumulator = aggregateValue.createAccumulatorWithInitialState(context.getTypeRepository(), null);
} else {
this.accumulator = aggregateValue.createAccumulatorWithInitialState(context.getTypeRepository(), partialAggregationResult.getAccumulatorStatesList());
try {
this.currentGroup = DynamicMessage.parseFrom(context.getTypeRepository().newMessageBuilder(groupingKeyValue.getResultType()).getDescriptorForType(), partialAggregationResult.getGroupKey().toByteArray());
} catch (InvalidProtocolBufferException e) {
throw new RuntimeException(e);
}
}
this.store = store;
this.context = context;
this.alias = alias;
Expand Down Expand Up @@ -167,20 +181,22 @@ private boolean isGroupBreak(final Object currentGroup, final Object nextGroup)
}
}

public void finalizeGroup() {
finalizeGroup(null);
public RecordCursorProto.PartialAggregationResult finalizeGroup() {
return finalizeGroup(null);
}

private void finalizeGroup(Object nextGroup) {
private RecordCursorProto.PartialAggregationResult finalizeGroup(Object nextGroup) {
final EvaluationContext nestedContext = context.childBuilder()
.setBinding(groupingKeyAlias, currentGroup)
.setBinding(aggregateAlias, accumulator.finish())
.build(context.getTypeRepository());
previousCompleteResult = completeResultValue.eval(store, nestedContext);

RecordCursorProto.PartialAggregationResult result = getPartialAggregationResult();
currentGroup = nextGroup;
// "Reset" the accumulator by creating a fresh one.
accumulator = aggregateValue.createAccumulator(context.getTypeRepository());
accumulator = aggregateValue.createAccumulatorWithInitialState(context.getTypeRepository(), null);
return result;
}

private void accumulate(@Nullable Object currentObject) {
Expand All @@ -193,4 +209,19 @@ private Object evalGroupingKey(@Nullable final Object currentObject) {
final EvaluationContext nestedContext = context.withBinding(Bindings.Internal.CORRELATION, alias, currentObject);
return Objects.requireNonNull(groupingKeyValue).eval(store, nestedContext);
}

@Nullable
public RecordCursorProto.PartialAggregationResult getPartialAggregationResult() {
if (currentGroup == null) {
return null;
}
List<RecordCursorProto.AccumulatorState> accumulatorStates = accumulator.getAccumulatorStates();
if (accumulatorStates.isEmpty()) {
return null;
}
return RecordCursorProto.PartialAggregationResult.newBuilder()
.setGroupKey(Objects.requireNonNull((Message)currentGroup).toByteString())
.addAllAccumulatorStates(accumulatorStates)
.build();
}
}
Loading