-
Notifications
You must be signed in to change notification settings - Fork 4.1k
logstore: funnel all raft log deletions through logstore package #169640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
e17e79d
7e7ef2c
d6d5755
d43095f
f680d3e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -171,19 +171,32 @@ func destroyReplicaImpl( | |
| ); err != nil { | ||
| return err | ||
| } | ||
| // TODO(ibrahim): We could know `hi` if DestroyReplicaInfo passes down the | ||
| // log's last index. | ||
| if err := logstore.ClearRange( | ||
| ctx, rw.Raft.RO, rw.Raft.WO, buf.RaftLogPrefix(), | ||
| info.RaftAppliedIndex+1 /* lo */, math.MaxUint64 /* hi */, ClearRangeThresholdPointKeys(), | ||
| ); err != nil { | ||
| return err | ||
| } | ||
| } else { | ||
| if err := storage.ClearRangeWithHeuristic( | ||
| ctx, rw.Raft.RO, rw.Raft.WO, | ||
| buf.RaftLogKey(info.RaftAppliedIndex+1), sl.RaftReplicaIDKey(), | ||
| buf.RangeTombstoneKey().Next(), sl.RaftLogPrefix(), | ||
| ClearRangeThresholdPointKeys(), | ||
| ); err != nil { | ||
| return err | ||
| } | ||
| } else if err := storage.ClearRangeWithHeuristic( | ||
| ctx, rw.Raft.RO, rw.Raft.WO, | ||
| buf.RangeTombstoneKey().Next(), sl.RaftReplicaIDKey(), | ||
| ClearRangeThresholdPointKeys(), | ||
| ); err != nil { | ||
| return err | ||
| // Note: We could just clear the whole raft log in the | ||
| // ClearRangeWithHeuristic() above. However, we want to funnel all raft log | ||
| // deletions through the logstore package to make it easier to reason about | ||
| // them. | ||
| if err := logstore.ClearRange( | ||
| ctx, rw.Raft.RO, rw.Raft.WO, buf.RaftLogPrefix(), | ||
| 0 /* lo */, math.MaxUint64 /* hi */, ClearRangeThresholdPointKeys(), | ||
| ); err != nil { | ||
| return err | ||
| } | ||
| } | ||
| if err := sl.ClearRaftReplicaID(rw.State.WO); err != nil { | ||
| return err | ||
|
|
@@ -265,10 +278,14 @@ func RewriteRaftState( | |
| if err := sl.SetHardState(ctx, raftWO, hs); err != nil { | ||
| return errors.Wrapf(err, "unable to write HardState") | ||
| } | ||
| // Clear the raft log. Note that there are no Pebble range keys in this span. | ||
| raftLog := sl.RaftLogPrefix() // NB: use only until next StateLoader call | ||
| if err := raftWO.ClearRawRange( | ||
| raftLog, raftLog.PrefixEnd(), true /* pointKeys */, false, /* rangeKeys */ | ||
| // Clear the raft log via the logstore. Note that there are no Pebble range | ||
| // keys in this span. We use ClearRangeSizeKnown with pointKeyThreshold=0 to | ||
| // force a single range tombstone over the whole log span, without scanning. | ||
| // TODO(ibrahim): We can actually know the log bounds using truncIndex and lastIndex. | ||
| if err := logstore.ClearRangeSizeKnown( | ||
| raftWO, sl.RaftLogPrefix(), | ||
| 0 /* lo */, math.MaxUint64 /* hi */, 0, /* pointKeyThreshold */ | ||
|
Collaborator
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can probably make it a "size known" case. If we are applying a snapshot to an existing replica, it knows its log. Otherwise, the log is empty. Btw, I just realized we need more code here for the separated engines case. We shouldn't be deleting the entire log - the applied part is deferred to the WAG truncator.
Contributor
Author
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want this PR to be as much of a mechanical refactor as possible. I left a TODO to address this in another PR. |
||
| false, /* maybeUseSingleDel */ | ||
| ); err != nil { | ||
| return errors.Wrapf(err, "unable to clear the raft log") | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -271,25 +271,29 @@ func (t *WAGTruncator) maybeAdvanceAllowedIndex() { | |
| } | ||
| } | ||
|
|
||
| // clearReplicaRaftLogAndSideloaded clears raft log entries at or below the given index for | ||
| // a destroyed or subsumed replica, and it also deletes the sideloaded files associated with the | ||
| // deleted entries. | ||
| // clearReplicaRaftLogAndSideloaded clears raft log entries at or below the | ||
| // given index for a destroyed or subsumed replica, and it also deletes the | ||
| // sideloaded files associated with the deleted entries. | ||
| func (t *WAGTruncator) clearReplicaRaftLogAndSideloaded( | ||
| ctx context.Context, raft Raft, rangeID roachpb.RangeID, lastIndex kvpb.RaftIndex, | ||
| ) error { | ||
| if logstore.UseRaftLogSingleDelete(t.eng.Separated()) { | ||
| if err := clearRaftLogWithSingleDelete( | ||
| ctx, raft.RO, raft.WO, rangeID, lastIndex, | ||
| prefixBuf := keys.MakeRangeIDPrefixBuf(rangeID) | ||
| // We want to delete all raft log entries < hi. Since Raft log doesn't have | ||
| // holes, we can get the first index, calculate the log size, and call | ||
| // ClearRangeSizeKnown(). If no entries exist in [RaftLogPrefix, hi) (e.g., | ||
|
Contributor
Author
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
|
||
| // this replica never received entries) this operation is a no-op. | ||
| hi := lastIndex + 1 | ||
| lo, ok, err := firstRaftLogIndex(ctx, raft.RO, prefixBuf, hi) | ||
| if err != nil { | ||
| return errors.Wrapf(err, "finding first raft log index for r%d", rangeID) | ||
| } | ||
| if ok { | ||
| if err := logstore.ClearRangeSizeKnown( | ||
| raft.WO, prefixBuf.RaftLogPrefix(), lo, hi, ClearRangeThresholdPointKeys(), | ||
| logstore.UseRaftLogSingleDelete(t.eng.Separated()), | ||
| ); err != nil { | ||
| return errors.Wrapf(err, "clearing raft log entries for r%d", rangeID) | ||
| } | ||
| } else if err := storage.ClearRangeWithHeuristic( | ||
| ctx, raft.RO, raft.WO, | ||
| keys.RaftLogPrefix(rangeID), /* start */ | ||
| keys.RaftLogKey(rangeID, lastIndex+1), /* end */ | ||
| ClearRangeThresholdPointKeys(), | ||
| ); err != nil { | ||
| return errors.Wrapf(err, "clearing raft log entries for r%d", rangeID) | ||
| } | ||
|
|
||
| // In general, we shouldn't delete sideloaded files before committing the | ||
|
|
@@ -312,39 +316,38 @@ func (t *WAGTruncator) clearReplicaRaftLogAndSideloaded( | |
| return ss.Sync() | ||
| } | ||
|
|
||
| // clearRaftLogWithSingleDelete clears raft log entries using SingleDelete for | ||
| // each point key. Unlike the regular truncation path, this always uses point | ||
| // deletions and never falls back to a range tombstone for simplicity. | ||
| // TODO(ibrahim): Let this function use the same pointDelThreshold heuristic | ||
| // when clearing the raft log. | ||
| func clearRaftLogWithSingleDelete( | ||
| ctx context.Context, | ||
| r storage.Reader, | ||
| w storage.Writer, | ||
| rangeID roachpb.RangeID, | ||
| lastIndex kvpb.RaftIndex, | ||
| ) error { | ||
| start := keys.RaftLogPrefix(rangeID) | ||
| end := keys.RaftLogKey(rangeID, lastIndex+1) | ||
| // firstRaftLogIndex returns the smallest raft log index that exists in | ||
| // [RaftLogPrefix, hi). | ||
| // The boolean result reports whether such an index was found. When it is | ||
| // false, the returned index is zero and must not be used. | ||
| func firstRaftLogIndex( | ||
| ctx context.Context, r storage.Reader, prefixBuf keys.RangeIDPrefixBuf, hi kvpb.RaftIndex, | ||
| ) (kvpb.RaftIndex, bool, error) { | ||
| start := prefixBuf.RaftLogPrefix().Clone() | ||
| end := prefixBuf.RaftLogKey(hi).Clone() | ||
| iter, err := r.NewEngineIterator(ctx, storage.IterOptions{ | ||
| KeyTypes: storage.IterKeyTypePointsOnly, | ||
| LowerBound: start, | ||
| UpperBound: end, | ||
| }) | ||
| if err != nil { | ||
| return err | ||
| return 0, false, err | ||
| } | ||
| defer iter.Close() | ||
|
|
||
| ok, err := iter.SeekEngineKeyGE(storage.EngineKey{Key: start}) | ||
| for ; ok; ok, err = iter.NextEngineKey() { | ||
| key, kerr := iter.UnsafeEngineKey() | ||
| if kerr != nil { | ||
| return kerr | ||
| } | ||
| if err := w.SingleClearEngineKey(key); err != nil { | ||
| return err | ||
| } | ||
| if err != nil { | ||
| return 0, false, err | ||
| } | ||
| if !ok { | ||
| return 0, false, nil // no raft entry was found in the [0, hi) span. | ||
| } | ||
| key, err := iter.UnsafeEngineKey() | ||
| if err != nil { | ||
| return 0, false, err | ||
| } | ||
| lo, err := keys.DecodeRaftLogKeyFromSuffix(key.Key[len(start):]) | ||
| if err != nil { | ||
| return 0, false, err | ||
| } | ||
| return err | ||
| return lo, true, nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could make `hi` known here if the `DestroyReplicaInfo` passes down the log's last index. Dunno if we want to do it (it's "safer" to clear the whole suffix pretending we don't know where it ends), but it's a possibility.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
True. I want this PR to be as much of a mechanical refactor as possible. I left a TODO to address this in another PR