From 7e2405577376a7c93efad8aaa3c59efe3ae1ceae Mon Sep 17 00:00:00 2001 From: Annie Pompa Date: Tue, 14 Oct 2025 12:02:37 -0400 Subject: [PATCH] db: min size for mvcc separation should be configurable We will add a field, `MinimumMVCCGarbageSize`, that specifies the minimum value size required for likely MVCC garbage to be eligible for separation. Fixes: #5377 --- compaction.go | 9 ++-- data_test.go | 19 ++++++-- event_listener_test.go | 7 +-- internal/compact/run.go | 5 ++- metamorphic/options.go | 2 + metrics_test.go | 7 +-- options.go | 62 +++++++++++++++++++------- options_test.go | 16 +++---- replay/replay_test.go | 9 ++-- replay/testdata/replay_val_sep | 5 ++- sstable/colblk_writer.go | 24 +++++----- testdata/checkpoint | 2 +- testdata/compaction/backing_value_size | 2 +- testdata/compaction/mvcc_garbage_blob | 2 +- testdata/compaction/value_separation | 22 ++++----- testdata/compaction_picker_scores | 2 +- testdata/metrics | 8 ++-- testdata/static_span_policy_func | 42 ++++++++--------- testdata/table_stats | 2 +- value_separation.go | 36 ++++++++++----- 20 files changed, 172 insertions(+), 111 deletions(-) diff --git a/compaction.go b/compaction.go index de7e37b526..bc9ef0b89f 100644 --- a/compaction.go +++ b/compaction.go @@ -3476,14 +3476,15 @@ func (d *DB) compactAndWrite( writerOpts.Compression = block.FastestCompression } vSep := valueSeparation - switch spanPolicy.ValueStoragePolicy { - case ValueStorageLowReadLatency: + switch spanPolicy.ValueStoragePolicy.PolicyAdjustment { + case NoValueSeparation: vSep = compact.NeverSeparateValues{} - case ValueStorageLatencyTolerant: + case Override: // This span of keyspace is more tolerant of latency, so set a more // aggressive value separation policy for this output. vSep.SetNextOutputConfig(compact.ValueSeparationOutputConfig{ - MinimumSize: latencyTolerantMinimumSize, + MinimumSize: spanPolicy.ValueStoragePolicy.MinimumSize, + MinimumMVCCGarbageSize: spanPolicy.ValueStoragePolicy.MinimumMVCCGarbageSize, }) } objMeta, tw, err := d.newCompactionOutputTable(jobID, c, writerOpts) diff --git a/data_test.go b/data_test.go index 7d20cc97e8..ac579c2252 100644 --- a/data_test.go +++ b/data_test.go @@ -1783,21 +1783,27 @@ func parseDBOptionsArgs(opts *Options, args []datadriven.CmdArg) error { } policy := SpanPolicy{ DisableValueSeparationBySuffix: true, - ValueStoragePolicy: ValueStorageLowReadLatency, + ValueStoragePolicy: ValueStoragePolicy{ + PolicyAdjustment: NoValueSeparation, + }, } spanPolicies = append(spanPolicies, SpanAndPolicy{ KeyRange: span, Policy: policy, }) - case "latency-tolerant-span": + case "override-span": if len(cmdArg.Vals) != 2 { - return errors.New("latency-tolerant-span expects 2 arguments: ") + return errors.New("override-span expects 2 arguments: ") } span := KeyRange{ Start: []byte(cmdArg.Vals[0]), End: []byte(cmdArg.Vals[1]), } - policy := SpanPolicy{ValueStoragePolicy: ValueStorageLatencyTolerant} + policy := SpanPolicy{ValueStoragePolicy: ValueStoragePolicy{ + PolicyAdjustment: Override, + MinimumSize: latencyTolerantMinimumSize, + MinimumMVCCGarbageSize: 1, + }} spanPolicies = append(spanPolicies, SpanAndPolicy{ KeyRange: span, Policy: policy, @@ -1871,6 +1877,11 @@ func parseDBOptionsArgs(opts *Options, args []datadriven.CmdArg) error { if err != nil { return err } + case "min-mvcc-garbage-size": + policy.MinimumMVCCGarbageSize, err = strconv.Atoi(value) + if err != nil { + return err + } default: return errors.Newf("unrecognized value-separation argument %q", name) } diff --git a/event_listener_test.go b/event_listener_test.go index 8eeb19a705..3986a31b25 100644 --- a/event_listener_test.go +++ b/event_listener_test.go @@ -633,9 +633,10 @@ func TestBlobCorruptionEvent(t *testing.T) { } opts.Experimental.ValueSeparationPolicy = func() ValueSeparationPolicy { return ValueSeparationPolicy{ - Enabled: true, - MinimumSize: 1, - MaxBlobReferenceDepth: 10, + Enabled: true, + MinimumSize: 1, + MinimumMVCCGarbageSize: 1, + MaxBlobReferenceDepth: 10, } } d, err := Open("", opts) diff --git a/internal/compact/run.go b/internal/compact/run.go index 0012717dfc..4169dc081d 100644 --- a/internal/compact/run.go +++ b/internal/compact/run.go @@ -115,6 +115,9 @@ type ValueSeparationOutputConfig struct { // MinimumSize is the minimum size of a value that will be separated into a // blob file. MinimumSize int + // MinimumMVCCGarbageSize is the minimum size of a value that will be + // separated into a blob file if the value is likely MVCC garbage. + MinimumMVCCGarbageSize int } // ValueSeparation defines an interface for writing some values to separate blob @@ -344,7 +347,7 @@ func (r *Runner) writeKeysToTable( } valueLen := kv.V.Len() - isLikelyMVCCGarbage := sstable.IsLikelyMVCCGarbage(kv.K.UserKey, prevKeyKind, kv.K.Kind(), valueLen, prefixEqual) + isLikelyMVCCGarbage := sstable.IsLikelyMVCCGarbage(kv.K.UserKey, prevKeyKind, kv.K.Kind(), prefixEqual) // Add the value to the sstable, possibly separating its value into a // blob file. The ValueSeparation implementation is responsible for // writing the KV to the sstable. diff --git a/metamorphic/options.go b/metamorphic/options.go index ae1f6ab2f7..45547e4688 100644 --- a/metamorphic/options.go +++ b/metamorphic/options.go @@ -341,6 +341,7 @@ func defaultOptions(kf KeyFormat) *pebble.Options { return pebble.ValueSeparationPolicy{ Enabled: true, MinimumSize: 5, + MinimumMVCCGarbageSize: 1, MaxBlobReferenceDepth: 3, RewriteMinimumAge: 50 * time.Millisecond, GarbageRatioLowPriority: 0.10, // 10% garbage @@ -926,6 +927,7 @@ func RandomOptions(rng *rand.Rand, kf KeyFormat, cfg RandomOptionsCfg) *TestOpti policy := pebble.ValueSeparationPolicy{ Enabled: true, MinimumSize: 1 + rng.IntN(maxValueSize), + MinimumMVCCGarbageSize: 1 + rng.IntN(10), // [1, 11) MaxBlobReferenceDepth: 2 + rng.IntN(9), // 2-10 RewriteMinimumAge: time.Duration(rng.IntN(90)+10) * time.Millisecond, // [10ms, 100ms) GarbageRatioLowPriority: lowPri, diff --git a/metrics_test.go b/metrics_test.go index de4e7f2f4c..c15043bc66 100644 --- a/metrics_test.go +++ b/metrics_test.go @@ -254,9 +254,10 @@ func TestMetrics(t *testing.T) { opts.Experimental.EnableValueBlocks = func() bool { return true } opts.Experimental.ValueSeparationPolicy = func() ValueSeparationPolicy { return ValueSeparationPolicy{ - Enabled: true, - MinimumSize: 3, - MaxBlobReferenceDepth: 5, + Enabled: true, + MinimumSize: 3, + MinimumMVCCGarbageSize: 1, + MaxBlobReferenceDepth: 5, } } opts.TargetFileSizes[0] = 50 diff --git a/options.go b/options.go index e15b8be76a..30708e0558 100644 --- a/options.go +++ b/options.go @@ -1238,6 +1238,13 @@ type ValueSeparationPolicy struct { // // MinimumSize must be > 0. MinimumSize int + // MinimumMVCCGarbageSize specifies the minimum size of a value that can be + // separated into a blob file if said value is likely MVCC garbage. This + // applies only to SpanPolicies that permit separation of MVCC garbage, + // which is also the default. + // + // MinimumMVCCGarbageSize must be > 0. + MinimumMVCCGarbageSize int // MaxBlobReferenceDepth limits the number of potentially overlapping (in // the keyspace) blob files that can be referenced by a single sstable. If a // compaction may produce an output sstable referencing more than this many @@ -1311,36 +1318,51 @@ func (p SpanPolicy) String() string { if p.DisableValueSeparationBySuffix { sb.WriteString("disable-value-separation-by-suffix,") } - switch p.ValueStoragePolicy { - case ValueStorageLowReadLatency: - sb.WriteString("low-read-latency,") - case ValueStorageLatencyTolerant: - sb.WriteString("latency-tolerant,") + switch p.ValueStoragePolicy.PolicyAdjustment { + case NoValueSeparation: + sb.WriteString("no-value-separation,") + case Override: + sb.WriteString("override,") } return strings.TrimSuffix(sb.String(), ",") } -// ValueStoragePolicy is a hint used to determine where to store the values for -// KVs. -type ValueStoragePolicy uint8 +// ValueStoragePolicy is used to determine where to store the values for +// KVs. If the PolicyAdjustment specified is Override, the remaining fields +// are used to override the global configuration for value separation. +type ValueStoragePolicy struct { + // PolicyAdjustment specifies the policy adjustment to apply. + PolicyAdjustment ValueStoragePolicyAdjustment + // Remaining fields are ignored, unless the PolicyAdjustment is Override. + + // MinimumSize is the minimum size of the value. + MinimumSize int + // MinimumMVCCGarbageSize is the minimum size of the value that is likely + // MVCC garbage. + MinimumMVCCGarbageSize int +} + +// ValueStoragePolicyAdjustment is a hint used to determine where to store the +// values for KVs. +type ValueStoragePolicyAdjustment uint8 const ( - // ValueStorageDefault is the default value; Pebble will respect global - // configuration for value blocks and value separation. - ValueStorageDefault ValueStoragePolicy = iota + // UseDefault is the default value; Pebble will respect global + // configuration for value separation. + UseDefault ValueStoragePolicyAdjustment = iota - // ValueStorageLowReadLatency indicates Pebble should prefer storing values + // NoValueSeparation indicates Pebble should prefer storing values // in-place. - ValueStorageLowReadLatency + NoValueSeparation - // ValueStorageLatencyTolerant indicates value retrieval can tolerate + // Override indicates value retrieval can tolerate // additional latency, so Pebble should aggressively prefer storing values // separately if it can reduce write amplification. // // If the global Options' enable value separation, Pebble may choose to - // separate values under the LatencyTolerant policy even if they do not meet + // separate values under the Override policy even if they do not meet // the minimum size threshold of the global Options' ValueSeparationPolicy. - ValueStorageLatencyTolerant + Override ) // SpanPolicyFunc is used to determine the SpanPolicy for a key region. @@ -1855,6 +1877,7 @@ func (o *Options) String() string { fmt.Fprintln(&buf, "[Value Separation]") fmt.Fprintf(&buf, " enabled=%t\n", policy.Enabled) fmt.Fprintf(&buf, " minimum_size=%d\n", policy.MinimumSize) + fmt.Fprintf(&buf, " minimum_mvcc_garbage_size=%d\n", policy.MinimumMVCCGarbageSize) fmt.Fprintf(&buf, " max_blob_reference_depth=%d\n", policy.MaxBlobReferenceDepth) fmt.Fprintf(&buf, " rewrite_minimum_age=%s\n", policy.RewriteMinimumAge) fmt.Fprintf(&buf, " garbage_ratio_low_priority=%.2f\n", policy.GarbageRatioLowPriority) @@ -2300,6 +2323,10 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { var minimumSize int minimumSize, err = strconv.Atoi(value) valSepPolicy.MinimumSize = minimumSize + case "minimum_mvcc_garbage_size": + var minimumMVCCGarbageSize int + minimumMVCCGarbageSize, err = strconv.Atoi(value) + valSepPolicy.MinimumMVCCGarbageSize = minimumMVCCGarbageSize case "max_blob_reference_depth": valSepPolicy.MaxBlobReferenceDepth, err = strconv.Atoi(value) case "rewrite_minimum_age": @@ -2571,6 +2598,9 @@ func (o *Options) Validate() error { if policy.MinimumSize <= 0 { fmt.Fprintf(&buf, "ValueSeparationPolicy.MinimumSize (%d) must be > 0\n", policy.MinimumSize) } + if policy.MinimumMVCCGarbageSize <= 0 { + fmt.Fprintf(&buf, "ValueSeparationPolicy.MinimumMVCCGarbageSize (%d) must be > 0\n", policy.MinimumMVCCGarbageSize) + } if policy.MaxBlobReferenceDepth <= 0 { fmt.Fprintf(&buf, "ValueSeparationPolicy.MaxBlobReferenceDepth (%d) must be > 0\n", policy.MaxBlobReferenceDepth) } diff --git a/options_test.go b/options_test.go index ce66eb353d..5d603b366a 100644 --- a/options_test.go +++ b/options_test.go @@ -42,9 +42,10 @@ func (o *Options) randomizeForTesting(t testing.TB) { if o.FormatMajorVersion >= FormatValueSeparation && o.Experimental.ValueSeparationPolicy == nil && rand.Int64N(4) > 0 { lowPri := 0.1 + rand.Float64()*0.9 // [0.1, 1.0) policy := ValueSeparationPolicy{ - Enabled: true, - MinimumSize: 1 << rand.IntN(10), // [1, 512] - MaxBlobReferenceDepth: rand.IntN(10) + 1, // [1, 10) + Enabled: true, + MinimumSize: 1 << rand.IntN(10), // [1, 512] + MinimumMVCCGarbageSize: 1 + rand.IntN(10), // [1, 10] + MaxBlobReferenceDepth: 1 + rand.IntN(10), // [1, 10] // Constrain the rewrite minimum age to [0, 15s). RewriteMinimumAge: time.Duration(rand.IntN(15)) * time.Second, GarbageRatioLowPriority: lowPri, @@ -259,7 +260,6 @@ func TestOptionsCheckCompatibility(t *testing.T) { // Check that an OPTIONS file that configured an explicit WALDir that will // no longer be used errors if it's not also present in WALRecoveryDirs. - //require.Equal(t, ErrMissingWALRecoveryDir{Dir: "external-wal-dir"}, err := DefaultOptions().CheckCompatibility(storeDir, ` [Options] wal_dir=external-wal-dir @@ -648,10 +648,10 @@ func TestStaticSpanPolicyFunc(t *testing.T) { sap.KeyRange.End = []byte(p.Next()) p.Expect(":") switch tok := p.Next(); tok { - case "lowlatency": - sap.Policy.ValueStoragePolicy = ValueStorageLowReadLatency - case "latencytolerant": - sap.Policy.ValueStoragePolicy = ValueStorageLatencyTolerant + case "novalueseparation": + sap.Policy.ValueStoragePolicy.PolicyAdjustment = NoValueSeparation + case "override": + sap.Policy.ValueStoragePolicy.PolicyAdjustment = Override default: t.Fatalf("unknown policy: %s", tok) } diff --git a/replay/replay_test.go b/replay/replay_test.go index 4092ab15ae..25ac03947f 100644 --- a/replay/replay_test.go +++ b/replay/replay_test.go @@ -350,10 +350,11 @@ func collectCorpus(t *testing.T, fs *vfs.MemFS, name string) { } opts.Experimental.ValueSeparationPolicy = func() pebble.ValueSeparationPolicy { return pebble.ValueSeparationPolicy{ - Enabled: true, - MinimumSize: 3, - MaxBlobReferenceDepth: 5, - RewriteMinimumAge: 15 * time.Minute, + Enabled: true, + MinimumSize: 3, + MinimumMVCCGarbageSize: 1, + MaxBlobReferenceDepth: 5, + RewriteMinimumAge: 15 * time.Minute, } } setDefaultExperimentalOpts(opts) diff --git a/replay/testdata/replay_val_sep b/replay/testdata/replay_val_sep index 6d9326b9d5..24044c28b3 100644 --- a/replay/testdata/replay_val_sep +++ b/replay/testdata/replay_val_sep @@ -17,7 +17,7 @@ tree 0 LOCK 152 MANIFEST-000010 250 MANIFEST-000013 - 2947 OPTIONS-000003 + 2977 OPTIONS-000003 0 marker.format-version.000011.024 0 marker.manifest.000003.MANIFEST-000013 simple_val_sep/ @@ -32,7 +32,7 @@ tree 11 000011.log 707 000012.sst 187 MANIFEST-000013 - 2947 OPTIONS-000003 + 2977 OPTIONS-000003 0 marker.format-version.000001.024 0 marker.manifest.000001.MANIFEST-000013 @@ -93,6 +93,7 @@ cat build/OPTIONS-000003 [Value Separation] enabled=true minimum_size=3 + minimum_mvcc_garbage_size=1 max_blob_reference_depth=5 rewrite_minimum_age=15m0s garbage_ratio_low_priority=0.00 diff --git a/sstable/colblk_writer.go b/sstable/colblk_writer.go index ba8cc6f608..ec0609b136 100644 --- a/sstable/colblk_writer.go +++ b/sstable/colblk_writer.go @@ -643,9 +643,17 @@ func (w *RawColumnWriter) evaluatePoint( // We require: // . Value blocks to be enabled. // . IsLikelyMVCCGarbage to be true; see comment for MVCC garbage criteria. + // . The value to be sufficiently large. (Currently we simply require a + // non-zero length, so all non-empty values are eligible for storage + // out-of-band in a value block.) + // + // Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of + // valueHandle, this should be > 3. But tiny values are common in test and + // unlikely in production, so we use 0 here for better test coverage. useValueBlock := !w.opts.DisableValueBlocks && w.valueBlock != nil && - IsLikelyMVCCGarbage(key.UserKey, prevKeyKind, keyKind, valueLen, prefixEqual) + valueLen > 0 && + IsLikelyMVCCGarbage(key.UserKey, prevKeyKind, keyKind, prefixEqual) if !useValueBlock { return eval, nil } @@ -1285,26 +1293,14 @@ func (w *RawColumnWriter) SetValueSeparationProps(kind ValueSeparationKind, minV // // . The previous key to be a SET/SETWITHDEL. // . The current key to be a SET/SETWITHDEL. -// . The value to be sufficiently large. (Currently we simply require a -// non-zero length, so all non-empty values are eligible for storage -// out-of-band in a value block.) // . The current key to have the same prefix as the previous key. -// -// Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of -// valueHandle, this should be > 3. But tiny values are common in test and -// unlikely in production, so we use 0 here for better test coverage. func IsLikelyMVCCGarbage( - k []byte, - prevKeyKind, keyKind base.InternalKeyKind, - valueLen int, - prefixEqual func(k []byte) bool, + k []byte, prevKeyKind, keyKind base.InternalKeyKind, prefixEqual func(k []byte) bool, ) bool { - const tinyValueThreshold = 0 isSetStarKind := func(k base.InternalKeyKind) bool { return k == InternalKeyKindSet || k == InternalKeyKindSetWithDelete } return isSetStarKind(prevKeyKind) && isSetStarKind(keyKind) && - valueLen > tinyValueThreshold && prefixEqual(k) } diff --git a/testdata/checkpoint b/testdata/checkpoint index 3142baa34a..9f4de302b1 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -895,7 +895,7 @@ L6: # Create a new database with value separation enabled to test that blob files # are included in checkpoints. -open valsepdb value-separation=(enabled, min-size=1, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0) +open valsepdb value-separation=(enabled, min-size=1, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) ---- mkdir-all: valsepdb 0755 open-dir: . diff --git a/testdata/compaction/backing_value_size b/testdata/compaction/backing_value_size index d28bb1adf9..2fe8b72b43 100644 --- a/testdata/compaction/backing_value_size +++ b/testdata/compaction/backing_value_size @@ -1,7 +1,7 @@ # Test a blob file rewrite compaction with virtual sstable references that # track backing value sizes do not trigger unnecessary blob file rewrites. -define value-separation=(enabled=true, min-size=1, max-ref-depth=10, garbage-ratios=0.01:0.2) +define value-separation=(enabled=true, min-size=1, max-ref-depth=10, garbage-ratios=0.01:0.2, min-mvcc-garbage-size=1) ---- batch diff --git a/testdata/compaction/mvcc_garbage_blob b/testdata/compaction/mvcc_garbage_blob index f83532550f..f6f5360b20 100644 --- a/testdata/compaction/mvcc_garbage_blob +++ b/testdata/compaction/mvcc_garbage_blob @@ -1,6 +1,6 @@ # Set the minimum size for a separated value to 5. -define value-separation=(enabled, min-size=5, max-ref-depth=3, garbage-ratios=1.0:1.0) +define value-separation=(enabled, min-size=5, max-ref-depth=3, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) ---- batch diff --git a/testdata/compaction/value_separation b/testdata/compaction/value_separation index c889ea939c..7e176b254b 100644 --- a/testdata/compaction/value_separation +++ b/testdata/compaction/value_separation @@ -1,7 +1,7 @@ # Test a simple sequence of flushes and compactions where all values are # separated. -define value-separation=(enabled, min-size=1, max-ref-depth=3, rw-min-age=0, garbage-ratios=1.0:1.0) +define value-separation=(enabled, min-size=1, max-ref-depth=3, rw-min-age=0, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) ---- batch @@ -81,7 +81,7 @@ compact a-d # and the value separation policy is configured to limit the blob reference # depth to 3. -define verbose value-separation=(enabled, min-size=3, max-ref-depth=3, rw-min-age=0s, garbage-ratios=1.0:1.0) +define verbose value-separation=(enabled, min-size=3, max-ref-depth=3, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) L6 blob-depth=3 a.SET.0:a b.SET.0:blob{fileNum=100002 value=bar} @@ -223,7 +223,7 @@ COMPRESSION # Set the minimum size for a separated value to 5. -define value-separation=(enabled, min-size=5, max-ref-depth=3, rw-min-age=0s, garbage-ratios=1.0:1.0) +define value-separation=(enabled, min-size=5, max-ref-depth=3, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) ---- batch @@ -273,7 +273,7 @@ w:world # Configure the database to require keys in the range [a,m) to be in-place. -define required-in-place=(a,m) value-separation=(enabled, min-size=1, max-ref-depth=3, rw-min-age=0s, garbage-ratios=1.0:1.0) +define required-in-place=(a,m) value-separation=(enabled, min-size=1, max-ref-depth=3, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) ---- batch @@ -300,7 +300,7 @@ Blob files: # references. Because these files overlap and are in separate sublevels, a # compaction that preserves blob references should sum their depths. -define value-separation=(enabled, min-size=1, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0) l0-compaction-threshold=2 +define value-separation=(enabled, min-size=1, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) l0-compaction-threshold=2 L0 blob-depth=1 a.SET.9:a d.SET.9:blob{fileNum=100001 value=d} @@ -362,7 +362,7 @@ obsolete-key: hex:00 # sublevel, a compaction that preserves blob references should take the MAX of # their depths. -define value-separation=(enabled, min-size=1, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0) l0-compaction-threshold=2 +define value-separation=(enabled, min-size=1, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) l0-compaction-threshold=2 L0 blob-depth=1 a.SET.9:a d.SET.9:blob{fileNum=100001 value=d} @@ -397,7 +397,7 @@ Blob files: # garbage ratio of 0.0 (no garbage). With this configuration, any blob file that # contains any unreferenced values should be immediately compacted. -define value-separation=(enabled, min-size=1, max-ref-depth=2, rw-min-age=0s, garbage-ratios=0.0:0.0) auto-compactions=off +define value-separation=(enabled, min-size=1, max-ref-depth=2, rw-min-age=0s, garbage-ratios=0.0:0.0, min-mvcc-garbage-size=1) auto-compactions=off ---- batch @@ -526,7 +526,7 @@ COMPRESSION # Test a blob file rewrite compaction with virtual sstable references. -define value-separation=(enabled, min-size=1, max-ref-depth=10, rw-min-age=0s, garbage-ratios=0.01:0.01) +define value-separation=(enabled, min-size=1, max-ref-depth=10, rw-min-age=0s, garbage-ratios=0.01:0.01, min-mvcc-garbage-size=1) ---- batch @@ -564,7 +564,7 @@ validate-blob-reference-index-block ---- validated -define value-separation=(enabled, min-size=5, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0) l0-compaction-threshold=1 target-file-sizes=65536 memtable-size=10000000 +define value-separation=(enabled, min-size=5, max-ref-depth=5, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) l0-compaction-threshold=1 target-file-sizes=65536 memtable-size=10000000 ---- # Test writing a non-trivial amount of data. With a key length of 4, we'll write @@ -685,10 +685,10 @@ would excise 1 files. # Test a value separation policy that is configured to only separate values ≥ -# 1024 bytes, but there's also a key span defined with the latency-tolerant +# 1024 bytes, but there's also a key span defined with the overrride # value storage policy. -define value-separation=(enabled, min-size=1024, max-ref-depth=10, rw-min-age=0s, garbage-ratios=1.0:1.0) latency-tolerant-span=(f,o) +define value-separation=(enabled, min-size=1024, max-ref-depth=10, rw-min-age=0s, garbage-ratios=1.0:1.0, min-mvcc-garbage-size=1) override-span=(f,o) ---- batch diff --git a/testdata/compaction_picker_scores b/testdata/compaction_picker_scores index 77864a2f3c..5a9006fd3f 100644 --- a/testdata/compaction_picker_scores +++ b/testdata/compaction_picker_scores @@ -271,7 +271,7 @@ L6 898KB 0.00 0.97 0.97 # it weren't for the high priority blob file rewrite compaction heuristic # triggering, we would pursue an automatic score compaction from L0 -> LBase. -define l0-compaction-threshold=1 lbase-max-bytes=65536 enable-table-stats=true auto-compactions=off value-separation=(enabled, min-size=1, max-ref-depth=5, garbage-ratios=0.1:0.2) +define l0-compaction-threshold=1 lbase-max-bytes=65536 enable-table-stats=true auto-compactions=off value-separation=(enabled, min-size=1, max-ref-depth=5, garbage-ratios=0.1:0.2, min-mvcc-garbage-size=1) ---- batch diff --git a/testdata/metrics b/testdata/metrics index df1723b6c3..549998b902 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -184,7 +184,7 @@ Iter category stats: disk-usage ---- -3,709B +3,739B batch set b 2 @@ -287,7 +287,7 @@ Iter category stats: disk-usage ---- -6,002B +6,032B # Closing iter a will release one of the zombie memtables. @@ -454,7 +454,7 @@ Iter category stats: disk-usage ---- -5,295B +5,325B # Closing iter b will release the last zombie sstable and the last zombie memtable. @@ -539,7 +539,7 @@ Iter category stats: disk-usage ---- -4,588B +4,618B additional-metrics ---- diff --git a/testdata/static_span_policy_func b/testdata/static_span_policy_func index 4ff4bf3621..d76a1db3b0 100644 --- a/testdata/static_span_policy_func +++ b/testdata/static_span_policy_func @@ -1,59 +1,59 @@ # A single policy. -test d-f:lowlatency +test d-f:novalueseparation a b c d e f g ---- a -> until d b -> until d c -> until d -d -> low-read-latency until f -e -> low-read-latency until f +d -> no-value-separation until f +e -> no-value-separation until f f -> none g -> none # A single encompassing policy. -test a-z:lowlatency +test a-z:novalueseparation a b c d e z ---- -a -> low-read-latency until z -b -> low-read-latency until z -c -> low-read-latency until z -d -> low-read-latency until z -e -> low-read-latency until z +a -> no-value-separation until z +b -> no-value-separation until z +c -> no-value-separation until z +d -> no-value-separation until z +e -> no-value-separation until z z -> none # Abutting policies. -test b-d:lowlatency d-f:latencytolerant f-h:lowlatency +test b-d:novalueseparation d-f:override f-h:novalueseparation a b c d e f g h i z ---- a -> until b -b -> low-read-latency until d -c -> low-read-latency until d -d -> latency-tolerant until f -e -> latency-tolerant until f -f -> low-read-latency until h -g -> low-read-latency until h +b -> no-value-separation until d +c -> no-value-separation until d +d -> override until f +e -> override until f +f -> no-value-separation until h +g -> no-value-separation until h h -> none i -> none z -> none # Gaps between policies. -test b-d:lowlatency h-j:latencytolerant +test b-d:novalueseparation h-j:override a b c d e f g h i j k l ---- a -> until b -b -> low-read-latency until d -c -> low-read-latency until d +b -> no-value-separation until d +c -> no-value-separation until d d -> until h e -> until h f -> until h g -> until h -h -> latency-tolerant until j -i -> latency-tolerant until j +h -> override until j +i -> override until j j -> none k -> none l -> none diff --git a/testdata/table_stats b/testdata/table_stats index 4fcc67ae4f..4056c2f3fb 100644 --- a/testdata/table_stats +++ b/testdata/table_stats @@ -980,7 +980,7 @@ range-deletions-bytes-estimate: 482 # Create a database with value separation enabled. -define format-major-version=24 value-separation=(enabled, min-size=1, max-ref-depth=10, rw-min-age=1m, garbage-ratios=0.1:0.3) target-file-sizes=(100000) block-size=32768 +define format-major-version=24 value-separation=(enabled, min-size=1, max-ref-depth=10, rw-min-age=1m, garbage-ratios=0.1:0.3, min-mvcc-garbage-size=1) target-file-sizes=(100000) block-size=32768 ---- batch diff --git a/value_separation.go b/value_separation.go index 114f682e62..9a1246e6ad 100644 --- a/value_separation.go +++ b/value_separation.go @@ -21,7 +21,7 @@ import ( // latencyTolerantMinimumSize is the minimum size, in bytes, of a value that // will be separated into a blob file when the value storage policy is -// ValueStorageLatencyTolerant. +// Override. const latencyTolerantMinimumSize = 10 var neverSeparateValues getValueSeparation = func(JobID, *tableCompaction, ValueStoragePolicy) compact.ValueSeparation { @@ -41,7 +41,7 @@ func (d *DB) determineCompactionValueSeparation( return compact.NeverSeparateValues{} } policy := d.opts.Experimental.ValueSeparationPolicy() - if !policy.Enabled || valueStorage == ValueStorageLowReadLatency { + if !policy.Enabled || valueStorage.PolicyAdjustment == NoValueSeparation { return compact.NeverSeparateValues{} } @@ -51,9 +51,9 @@ func (d *DB) determineCompactionValueSeparation( // This compaction should preserve existing blob references. kind := sstable.ValueSeparationDefault minSize := policy.MinimumSize - if valueStorage != ValueStorageDefault { + if valueStorage.PolicyAdjustment == Override { kind = sstable.ValueSeparationSpanPolicy - minSize = latencyTolerantMinimumSize + minSize = valueStorage.MinimumSize } return &preserveBlobReferences{ inputBlobPhysicalFiles: uniqueInputBlobMetadatas(&c.version.BlobFiles, c.inputs), @@ -64,6 +64,12 @@ func (d *DB) determineCompactionValueSeparation( } // This compaction should write values to new blob files. + minSize := policy.MinimumSize + mvccMinimumSize := policy.MinimumMVCCGarbageSize + if valueStorage.PolicyAdjustment == Override { + minSize = valueStorage.MinimumSize + mvccMinimumSize = valueStorage.MinimumMVCCGarbageSize + } return &writeNewBlobFiles{ comparer: d.opts.Comparer, newBlobObject: func() (objstorage.Writable, objstorage.ObjectMetadata, error) { @@ -72,8 +78,9 @@ func (d *DB) determineCompactionValueSeparation( }, shortAttrExtractor: d.opts.Experimental.ShortAttributeExtractor, writerOpts: d.makeBlobWriterOptions(c.outputLevel.level), - minimumSize: policy.MinimumSize, + minimumSize: minSize, globalMinimumSize: policy.MinimumSize, + mvccMinimumSize: mvccMinimumSize, invalidValueCallback: func(userKey []byte, value []byte, err error) { // The value may not be safe, so it will be redacted when redaction // is enabled. @@ -131,13 +138,13 @@ func shouldWriteBlobFiles( if !backingPropsValid { continue } - switch valueStorage { - case ValueStorageLowReadLatency: + switch valueStorage.PolicyAdjustment { + case NoValueSeparation: // This case should be handled prior to calling this function, // but include it here for completeness. return false, inputReferenceDepth - case ValueStorageLatencyTolerant: - if backingProps.ValueSeparationMinSize != latencyTolerantMinimumSize { + case Override: + if backingProps.ValueSeparationMinSize != uint64(valueStorage.MinimumSize) { return true, 0 } default: @@ -240,8 +247,12 @@ type writeNewBlobFiles struct { minimumSize int // globalMinimumSize is the size threshold for separating values into blob // files globally across the keyspace. It may be overridden per-output by - // SetNextOutputConfig. + // SetNextOutputConfig if minimumSize is different. globalMinimumSize int + // mvccMinimumSize is the minimum size of a value that will be separated into + // a blob file when the value is likely MVCC garbage; see comments in + // sstable.IsLikelyMVCCGarbage for the exact criteria we use. + mvccMinimumSize int // invalidValueCallback is called when a value is encountered for which the // short attribute extractor returns an error. invalidValueCallback func(userKey []byte, value []byte, err error) @@ -259,6 +270,7 @@ var _ compact.ValueSeparation = (*writeNewBlobFiles)(nil) // SetNextOutputConfig implements the ValueSeparation interface. func (vs *writeNewBlobFiles) SetNextOutputConfig(config compact.ValueSeparationOutputConfig) { vs.minimumSize = config.MinimumSize + vs.mvccMinimumSize = config.MinimumMVCCGarbageSize } // Kind implements the ValueSeparation interface. @@ -306,8 +318,10 @@ func (vs *writeNewBlobFiles) Add( vs.buf = v[:0] } + isLikelyMVCCGarbage = len(v) >= vs.mvccMinimumSize && isLikelyMVCCGarbage // Values that are too small are never separated; however, MVCC keys are - // separated if they are a SET key kind, as long as the value is not empty. + // separated if they are a SET key kind, as long as the value meets the + // mvccMinimumSize constraint. if len(v) < vs.minimumSize && !isLikelyMVCCGarbage { return tw.Add(kv.K, v, forceObsolete) }