Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion bindings/python/fdb/tuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,14 @@ def _decode(v, pos):
ret.append(val)
return tuple(ret), end_pos + 1
else:
raise ValueError("Unknown data type in DB: " + repr(v))
# Enhanced error reporting for debugging upgrade issues
error_context = {
'unknown_code': hex(code) if code < 256 else 'invalid',
'position': pos,
'data_length': len(v),
'surrounding_bytes': v[max(0, pos-5):pos+10].hex() if pos < len(v) else 'N/A'
}
raise ValueError(f"Unknown data type in DB at position {pos}: code={hex(code)} context={error_context} data={repr(v)}")


def _reduce_children(child_values):
Expand Down
153 changes: 153 additions & 0 deletions fdbbackup/tests/test_1k_parallel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env bash

# 1K s3_backup_test.sh with LIMITED parallelism (2 concurrent instances)
# Based on proven simple_s3backup_test.sh pattern that works
#
# Environment overrides (defaults preserve the original hard-coded paths):
#   SOURCE_DIR - FoundationDB source checkout
#   BUILD_DIR  - FoundationDB build output directory

set -euo pipefail

readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly TOTAL_TESTS=1000
readonly MAX_CONCURRENT=2 # CRITICAL: Only 2 concurrent safe! 3+ causes failures!
# Previously hard-coded personal paths; now overridable via the environment
# so the script is portable across machines (backward compatible defaults).
readonly SOURCE_DIR="${SOURCE_DIR:-/Users/stack/checkouts/fdb/foundationdb}"
readonly BUILD_DIR="${BUILD_DIR:-/Users/stack/build_output}"
readonly LOG_PREFIX="/tmp/s3backup_1k_parallel"

START_TIME=$(date)
START_TIMESTAMP=$(date +%s) # Store as epoch seconds for macOS compatibility (no GNU date -d)
COMPLETED_TESTS=0
declare -a RUNNING_PIDS=() # PIDs of in-flight background test runs
declare -a RUNNING_IDS=()  # test ids parallel to RUNNING_PIDS

# Emit a message to stdout prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Function to run a single test in background
# Launch a single s3_backup_test.sh run in the background and register it
# in the RUNNING_PIDS / RUNNING_IDS tracking arrays.
# Arguments: $1 - numeric test id (used in scratch dir and log file names)
start_test() {
  local id="$1"
  local workdir log_path child

  workdir=$(mktemp -d "/tmp/s3backup_parallel_${id}.XXXXXX")
  log_path="${LOG_PREFIX}_${id}.log"

  # Run the whole test inside a subshell. Failures are deliberately
  # tolerated here ("|| true"); pass/fail is judged later by scanning
  # the log file for the PASSED marker.
  (
    "${SCRIPT_DIR}/s3_backup_test.sh" "${SOURCE_DIR}" "${BUILD_DIR}" "${workdir}" &> "${log_path}" || true
    rm -rf "${workdir}" 2>/dev/null || true
  ) &

  child=$!
  RUNNING_PIDS+=("$child")
  RUNNING_IDS+=("$id")

  log "Started test ${id}/${TOTAL_TESTS} (PID: $child)"
}

# Function to wait for one test to complete
# Reap one finished background test, judge its result from its log file,
# and update the RUNNING_PIDS / RUNNING_IDS bookkeeping.
# On the first failed test: kills all remaining children and exits 1.
# Globals read:    RUNNING_PIDS, RUNNING_IDS, LOG_PREFIX, TOTAL_TESTS, START_TIMESTAMP
# Globals written: RUNNING_PIDS, RUNNING_IDS, COMPLETED_TESTS
wait_for_completion() {
    # Block until some background job finishes. The job subshells always
    # exit 0 (they end in "|| true"), so success is detected from the log
    # below, not from the wait status. The "|| true" guard keeps "set -e"
    # from aborting the script if a child was killed by a signal.
    wait -n || true

    # Find which tracked process completed; kill -0 fails for a reaped child.
    local completed_idx=-1
    local i
    for i in "${!RUNNING_PIDS[@]}"; do
        if ! kill -0 "${RUNNING_PIDS[$i]}" 2>/dev/null; then
            completed_idx=$i
            break
        fi
    done

    if [[ $completed_idx -ge 0 ]]; then
        local test_id="${RUNNING_IDS[$completed_idx]}"
        local logfile="${LOG_PREFIX}_${test_id}.log"

        # Check if test passed by looking for "PASSED" in log
        if grep -q "PASSED test_s3_backup_and_restore" "${logfile}"; then
            COMPLETED_TESTS=$((COMPLETED_TESTS + 1))
            log "✅ Test ${test_id} PASSED (${COMPLETED_TESTS}/${TOTAL_TESTS})"
        else
            log "❌ Test ${test_id} FAILED - stopping execution"
            log "Check log: ${logfile}"

            # Kill remaining background processes before bailing out.
            local pid
            for pid in "${RUNNING_PIDS[@]}"; do
                kill "$pid" 2>/dev/null || true
            done
            exit 1
        fi

        # Remove the finished entry and reindex. Subscripts are quoted
        # (shell globbing safety), and the ${arr[@]+...} expansion keeps
        # the reindex safe under "set -u" when the arrays become empty
        # (bash < 4.4 errors on "${empty[@]}" with nounset).
        unset 'RUNNING_PIDS[completed_idx]'
        unset 'RUNNING_IDS[completed_idx]'
        RUNNING_PIDS=(${RUNNING_PIDS[@]+"${RUNNING_PIDS[@]}"})
        RUNNING_IDS=(${RUNNING_IDS[@]+"${RUNNING_IDS[@]}"})

        # Progress update every 50 tests. Guard against division by zero
        # when the first milestone lands in the same second we started.
        if (( COMPLETED_TESTS % 50 == 0 )); then
            local elapsed=$(($(date +%s) - START_TIMESTAMP))
            (( elapsed > 0 )) || elapsed=1
            local rate=$(( COMPLETED_TESTS * 60 / elapsed ))
            log "Progress: ${COMPLETED_TESTS}/${TOTAL_TESTS} completed (${rate} tests/minute)"
        fi
    fi
}

log "=== S3 Backup Test 1K Parallel Runner ==="
log "Total tests to run: ${TOTAL_TESTS}"
log "Maximum concurrency: ${MAX_CONCURRENT} (CRITICAL LIMIT - DO NOT INCREASE)"
log "⚠️ WARNING: Each s3_backup_test.sh runs full FDB cluster (very resource intensive)"
log "⚠️ WARNING: 3+ concurrent instances cause failures due to resource exhaustion"
log "Start time: ${START_TIME}"

# Clean up any existing log files
rm -f "${LOG_PREFIX}"_*.log 2>/dev/null || true

# Run tests with limited parallelism. Drain to below capacity BEFORE
# starting each test: the previous start-then-wait shape silently skipped
# a test id whenever the capacity guard happened to be false at loop entry.
for (( test_id = 1; test_id <= TOTAL_TESTS; test_id++ )); do
    while [[ ${#RUNNING_PIDS[@]} -ge $MAX_CONCURRENT ]]; do
        wait_for_completion
    done

    start_test "$test_id"

    # Small delay to stagger startups
    sleep 3
done

# Wait for remaining tests to complete
log "Waiting for final ${#RUNNING_PIDS[@]} tests to complete..."
while [[ ${#RUNNING_PIDS[@]} -gt 0 ]]; do
    wait_for_completion
done

# Final report
end_time=$(date)
total_elapsed=$(($(date +%s) - START_TIMESTAMP))
(( total_elapsed > 0 )) || total_elapsed=1 # guard the divisions below

log "🎉 ALL ${TOTAL_TESTS} TESTS COMPLETED SUCCESSFULLY! 🎉"
log "Total time: $((total_elapsed / 3600))h $(((total_elapsed % 3600) / 60))m $((total_elapsed % 60))s"
log "Success rate: 100%"
log "Parallelism: ${MAX_CONCURRENT} concurrent tests"

# Generate proof file. Count logs via a glob instead of parsing ls output;
# every completed test leaves a log, so the pattern always matches here.
proof_file="/tmp/s3backup_1k_parallel_proof.txt"
log_files=( "${LOG_PREFIX}"_*.log )
{
    echo "=== S3 Backup Test 1K Parallel Execution Proof ==="
    echo "Start Time: ${START_TIME}"
    echo "End Time: ${end_time}"
    echo "Total Duration: $((total_elapsed / 3600))h $(((total_elapsed % 3600) / 60))m $((total_elapsed % 60))s"
    echo "Tests Completed: ${COMPLETED_TESTS}/${TOTAL_TESTS}"
    echo "Concurrency Level: ${MAX_CONCURRENT}"
    echo "Success Rate: 100%"
    echo "Log Files: ${#log_files[@]}"
    echo "Average Rate: $(( COMPLETED_TESTS * 3600 / total_elapsed )) tests/hour"
    echo "Completion: $(date)"
} > "${proof_file}"

log "Proof written to: ${proof_file}"
exit 0
153 changes: 153 additions & 0 deletions fdbclient/tests/README_concurrent_s3client_test.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Concurrent S3Client Test Script

## Overview

The `concurrent_s3client_test.sh` script stress-tests the robustness of `s3client_test.sh` by running multiple instances concurrently. It specifically verifies:

1. **Port conflict handling** - Ensures that when multiple MockS3Server instances try to bind to the same port, the retry logic works correctly
2. **Process cleanup** - Verifies that all spawned processes are properly cleaned up, preventing orphaned MockS3Server processes
3. **Concurrent execution** - Tests that multiple s3client_test.sh instances can run simultaneously without interfering with each other
4. **Signal handling** - Ensures proper cleanup when tests are interrupted

## Usage

```bash
./concurrent_s3client_test.sh <build_dir> [max_concurrent] [test_duration]
```

### Parameters

- `build_dir` (required): Path to your FoundationDB build output directory (e.g., `~/build_output`)
- `max_concurrent` (optional): Maximum number of concurrent test instances to run (default: 5)
- `test_duration` (optional): Maximum time in seconds to wait for tests to complete (default: 60)

### Examples

```bash
# Basic usage with default settings (5 concurrent tests, 60s timeout)
./concurrent_s3client_test.sh ~/build_output

# Run 10 concurrent tests with 2-minute timeout
./concurrent_s3client_test.sh ~/build_output 10 120

# Run 3 concurrent tests with 30-second timeout
./concurrent_s3client_test.sh ~/build_output 3 30
```

## Test Scenarios

The script runs three main test scenarios:

### 1. Basic Concurrent Execution
- Starts multiple s3client_test.sh instances with staggered delays
- Tests normal concurrent operation
- Verifies all instances complete successfully

### 2. Port Occupation Simulation
- Pre-occupies some ports in the range that MockS3Server uses (8080-8090)
- Starts test instances that must find alternative ports
- Verifies the port retry logic works correctly

### 3. Signal Handling Test
- Starts test instances and then sends SIGTERM signals
- Verifies that cleanup happens properly when tests are interrupted
- Checks for orphaned processes

## What It Tests

### Port Conflict Resolution
The script verifies that the [`mocks3_fixture.sh`](mocks3_fixture.sh:75-141) port retry logic works correctly:
- When port 8080 is occupied, MockS3Server should try 8081, 8082, etc.
- Multiple instances should be able to find available ports
- Port conflicts should not cause test failures

### Process Cleanup
The script monitors for:
- All MockS3Server processes are terminated when tests complete
- No orphaned processes remain after test completion
- Proper cleanup happens even when tests are killed with signals

### Concurrent Safety
The script tests:
- Multiple s3client_test.sh instances can run simultaneously
- Tests don't interfere with each other's scratch directories
- Log files are properly isolated per instance

## Output and Reporting

The script provides:

### Real-time Logging
- Timestamped log messages showing test progress
- Port usage monitoring
- Process lifecycle tracking

### Final Report
- Summary of all test results
- Count of tests started, completed, failed, and killed
- Number of port conflicts detected
- Detection of any orphaned processes

### Detailed Report File
A detailed report is written to `/tmp/concurrent_s3client_test_report.txt` containing:
- Individual test instance details
- Port usage analysis
- Log file locations and sizes
- Complete test timeline

## Exit Codes

- `0`: All tests passed successfully with proper cleanup
- `1`: Some tests completed successfully but with issues
- `2`: Tests failed or cleanup problems detected

## Requirements

The script requires:
- Bash 4.0 or later
- Standard Unix utilities (`netstat` or `ss`, `pgrep`, `pkill`)
- The s3client_test.sh script must be executable
- A valid FoundationDB build directory with fdbserver binary

## Port Range

The script uses ports 8080-8090 by default (configurable via `BASE_PORT` and `MAX_PORT_RETRIES` constants).

## Troubleshooting

### "Port already in use" errors
This is expected behavior when testing port conflict resolution. The script should handle these automatically.

### Orphaned processes
If the script detects orphaned MockS3Server processes, it will:
1. Report them in the logs
2. Attempt to kill them automatically
3. Mark the test as failed

### Permission issues
Ensure the script has execute permissions:
```bash
chmod +x concurrent_s3client_test.sh
```

### Missing dependencies
The script will fail if required binaries are missing. Ensure:
- `fdbserver` exists in `<build_dir>/bin/fdbserver`
- Standard Unix utilities are available
- The system supports process monitoring commands

## Integration with CI/CD

This script can be integrated into continuous integration pipelines to:
- Verify port handling robustness under load
- Catch process cleanup regressions
- Test concurrent execution scenarios
- Validate signal handling behavior

Example CI usage:
```bash
# Run quick concurrent test
./concurrent_s3client_test.sh "$BUILD_DIR" 3 30

# Run stress test
./concurrent_s3client_test.sh "$BUILD_DIR" 10 180
Loading