@@ -1,6 +1,7 @@
 package gtshred
 
 import (
+	"context"
 	"fmt"
 	"sync"
 	"time"
@@ -16,22 +17,37 @@ const (
 	maxBlockSize = 128 * 1024 * 1024 // 128MB maximum block size (matches Solana)
 )
 
+type ShredGroupWithTimestamp struct {
+	ShredGroup
+	Timestamp time.Time
+}
+
 type Processor struct {
-	groups          map[string]*ShredGroup
-	mu              sync.Mutex
-	cb              ProcessorCallback
-	completedBlocks map[string]time.Time
+	// cb is the callback to call when a block is fully reassembled
+	cb ProcessorCallback
+
+	// groups is a cache of shred groups currently being processed.
+	groups   map[string]*ShredGroupWithTimestamp
+	groupsMu sync.RWMutex
+
+	// completedBlocks is a cache of block hashes that have been fully reassembled and should no longer be processed.
+	completedBlocks   map[string]time.Time
+	completedBlocksMu sync.RWMutex
+
+	// cleanupInterval is the interval at which stale groups are cleaned up and completed blocks are removed
 	cleanupInterval time.Duration
 }
 
+// ProcessorCallback is the interface for processor callbacks.
 type ProcessorCallback interface {
 	ProcessBlock(height uint64, blockHash []byte, block []byte) error
 }
 
-func NewProcessor(cb ProcessorCallback, cleanupInterval time.Duration) *Processor {
+// NewProcessor creates a new Processor with the given callback and cleanup interval.
+func NewProcessor(ctx context.Context, cb ProcessorCallback, cleanupInterval time.Duration) *Processor {
 	p := &Processor{
 		cb:              cb,
-		groups:          make(map[string]*ShredGroup),
+		groups:          make(map[string]*ShredGroupWithTimestamp),
 		completedBlocks: make(map[string]time.Time),
 		cleanupInterval: cleanupInterval,
 	}
@@ -43,6 +59,8 @@ func NewProcessor(cb ProcessorCallback, cleanupInterval time.Duration) *Processo
 
 	for {
 		select {
+		case <-ctx.Done():
+			return
 		case now := <-ticker.C:
 			p.cleanupStaleGroups(now)
 		}
@@ -58,28 +76,42 @@ func (p *Processor) CollectShred(shred *gturbine.Shred) error {
 		return fmt.Errorf("nil shred")
 	}
 
+	p.completedBlocksMu.RLock()
 	// Skip shreds from already processed blocks
-	if _, completed := p.completedBlocks[string(shred.BlockHash)]; completed {
+	_, completed := p.completedBlocks[string(shred.BlockHash)]
+	p.completedBlocksMu.RUnlock()
+	if completed {
 		return nil
 	}
 
-	p.mu.Lock()
-	defer p.mu.Unlock()
+	// Take read lock on groups to check if group exists, and get it if it does.
+	p.groupsMu.RLock()
 	group, ok := p.groups[shred.GroupID]
+	p.groupsMu.RUnlock()
+
 	if !ok {
-		group := &ShredGroup{
-			DataShreds:          make([]*gturbine.Shred, shred.TotalDataShreds),
-			RecoveryShreds:      make([]*gturbine.Shred, shred.TotalRecoveryShreds),
-			TotalDataShreds:     shred.TotalDataShreds,
-			TotalRecoveryShreds: shred.TotalRecoveryShreds,
-			GroupID:             shred.GroupID,
-			BlockHash:           shred.BlockHash,
-			Height:              shred.Height,
-			OriginalSize:        shred.FullDataSize,
+		// If the group doesn't exist, create it and add the shred
+		group := &ShredGroupWithTimestamp{
+			ShredGroup: ShredGroup{
+				DataShreds:          make([]*gturbine.Shred, shred.TotalDataShreds),
+				RecoveryShreds:      make([]*gturbine.Shred, shred.TotalRecoveryShreds),
+				TotalDataShreds:     shred.TotalDataShreds,
+				TotalRecoveryShreds: shred.TotalRecoveryShreds,
+				GroupID:             shred.GroupID,
+				BlockHash:           shred.BlockHash,
+				Height:              shred.Height,
+				OriginalSize:        shred.FullDataSize,
+			},
+			Timestamp: time.Now(), // Record the time the group was created consumer side.
 		}
+
 		group.DataShreds[shred.Index] = shred
 
+		// Take write lock to add the group
+		p.groupsMu.Lock()
 		p.groups[shred.GroupID] = group
+		p.groupsMu.Unlock()
+
 		return nil
 	}
 
@@ -111,19 +143,51 @@ func (p *Processor) CollectShred(shred *gturbine.Shred) error {
 }
 
 func (p *Processor) cleanupStaleGroups(now time.Time) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
+	var deleteHashes []string
 
+	p.completedBlocksMu.RLock()
 	for hash, completedAt := range p.completedBlocks {
 		if now.Sub(completedAt) > p.cleanupInterval {
+			deleteHashes = append(deleteHashes, hash)
+		}
+	}
+	p.completedBlocksMu.RUnlock()
+
+	if len(deleteHashes) != 0 {
+		// Take write lock once for all deletions
+		p.completedBlocksMu.Lock()
+		for _, hash := range deleteHashes {
 			delete(p.completedBlocks, hash)
-			// Find and reset any groups with this block hash
-			for id, group := range p.groups {
-				if string(group.BlockHash) == hash {
-					group.Reset()
-					delete(p.groups, id)
-				}
+		}
+		p.completedBlocksMu.Unlock()
+	}
+
+	var deleteGroups []string
+
+	// Take read lock on groups to check for groups to delete (stale or duplicate blockhash)
+	p.groupsMu.RLock()
+	for id, group := range p.groups {
+		for _, hash := range deleteHashes {
+			// Check if group is associated with a completed block
+			if string(group.BlockHash) == hash {
+				deleteGroups = append(deleteGroups, id)
 			}
 		}
+
+		// Check if group is stale
+		if now.Sub(group.Timestamp) > p.cleanupInterval {
+			deleteGroups = append(deleteGroups, id)
+		}
+	}
+	p.groupsMu.RUnlock()
+
+	if len(deleteGroups) != 0 {
+		// Take write lock once for all deletions
+		p.groupsMu.Lock()
+		for _, id := range deleteGroups {
+			p.groups[id].Reset() // TODO: is this necessary?
+			delete(p.groups, id)
+		}
+		p.groupsMu.Unlock()
 	}
 }
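
For reference, a minimal usage sketch of the new constructor signature (assuming, as the hunks above suggest, that NewProcessor itself starts the background cleanup loop; the noopCallback type and usageSketch function below are hypothetical, for illustration only):

package gtshred

import (
	"context"
	"time"
)

// noopCallback is a hypothetical ProcessorCallback used only for this sketch.
type noopCallback struct{}

func (noopCallback) ProcessBlock(height uint64, blockHash []byte, block []byte) error {
	return nil
}

// usageSketch shows how a caller might drive the new API: the context passed to
// NewProcessor bounds the lifetime of the cleanup loop, so cancelling it stops
// the background goroutine instead of leaving it running for the process lifetime.
func usageSketch() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel() // stopping the processor is now just cancelling the context

	p := NewProcessor(ctx, noopCallback{}, 10*time.Second)

	// Shreds arriving from the network are fed in via p.CollectShred(shred);
	// when a group is fully reassembled, the callback's ProcessBlock is invoked.
	_ = p
}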