From f8be11eaa1b71fa23ee17cd4440e37ff5a6f15de Mon Sep 17 00:00:00 2001
From: Victor Dumitrescu <victor.dumitrescu@nomadic-labs.com>
Date: Mon, 8 Jun 2026 10:55:30 +0200
Subject: [PATCH 1/2] test(dursto): add a standalone long tests binary

---
 Cargo.lock                                    |  1 +
 durable-storage/Cargo.toml                    | 10 ++
 durable-storage/Makefile                      |  1 +
 durable-storage/src/bin/database_long_test.rs | 95 +++++++++++++++++++
 durable-storage/src/long_test/mod.rs          | 77 ++++++++++++---
 5 files changed, 169 insertions(+), 15 deletions(-)
 create mode 100644 durable-storage/src/bin/database_long_test.rs
diff --git a/Cargo.lock b/Cargo.lock
index 23edba8af2..7ca9fd74ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2561,6 +2561,7 @@ dependencies = [
  "bincode",
  "bytes",
  "cfg-if",
+ "clap",
  "criterion",
  "derive_more",
  "goldenfile",
diff --git a/durable-storage/Cargo.toml b/durable-storage/Cargo.toml
index 97c08ada9e..84ce6b0c5c 100644
--- a/durable-storage/Cargo.toml
+++ b/durable-storage/Cargo.toml
@@ -10,6 +10,7 @@ workspace = true
 [features]
 default = ["rocksdb"]
 unstable-test-utils = [
+  "dep:clap",
   "dep:proptest",
   "dep:serde",
   "dep:serde_with",
@@ -44,6 +45,10 @@ optional = true
 workspace = true
 optional = true
 
+[dependencies.clap]
+workspace = true
+optional = true
+
 [dependencies.serde_json]
 workspace = true
 optional = true
@@ -72,6 +77,11 @@ octez-riscv-test-utils.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 
+[[bin]]
+name = "database_long_test"
+path = "src/bin/database_long_test.rs"
+required-features = ["rocksdb", "unstable-test-utils"]
+
 [[bench]]
 name = "avl_tree"
 harness = false
diff --git a/durable-storage/Makefile b/durable-storage/Makefile
index 4115becb10..ed96cce89b 100644
--- a/durable-storage/Makefile
+++ b/durable-storage/Makefile
@@ -8,6 +8,7 @@ all: check test
 check:
 	@cargo clippy --all-targets --no-default-features -- --deny warnings
 	@cargo clippy -p xtask --all-targets -- --deny warnings
+	@cargo clippy --features unstable-test-utils --bin database_long_test -- --deny warnings
 
 # distinct from the workspace-level test in that it actually turns off
 # rocksdb: using the in-memory implementation instead
diff --git a/durable-storage/src/bin/database_long_test.rs b/durable-storage/src/bin/database_long_test.rs
new file mode 100644
index 0000000000..64baa71ba6
--- /dev/null
+++ b/durable-storage/src/bin/database_long_test.rs
@@ -0,0 +1,95 @@
+// SPDX-FileCopyrightText: 2026 Nomadic Labs <contact@nomadic-labs.com>
+//
+// SPDX-License-Identifier: MIT
+
+//! Binary for the long-running durable storage [`Database`] test.
+//!
+//! See [`octez_riscv_durable_storage::long_test`] for details
+//! about long tests.
+//!
+//! [`Database`]: octez_riscv_durable_storage::database::Database
+
+use std::path::PathBuf;
+use std::time::Duration;
+
+use anyhow::Context;
+use anyhow::Result;
+use clap::Parser;
+use clap::Subcommand;
+use octez_riscv_data::hash::Hash;
+use octez_riscv_durable_storage::long_test::LongTestConfig;
+use octez_riscv_durable_storage::long_test::replay_failure;
+use octez_riscv_durable_storage::long_test::run_long_test;
+
+#[derive(Debug, Parser)]
+#[command(version, long_about = None)]
+struct Cli {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Debug, Subcommand)]
+enum Commands {
+    /// Run the long test.
+    Test {
+        /// Target number of operations sampled per epoch.
+        #[arg(long, default_value_t = 1000)]
+        ops_per_epoch: usize,
+
+        /// Number of test cases per epoch.
+        #[arg(long, default_value_t = 256)]
+        cases_per_epoch: u32,
+
+        /// 32-byte hex-encoded test seed (default: run with a fresh seed, printed at startup).
+        #[arg(long)]
+        seed: Option<String>,
+
+        /// Time budget in minutes (default: none).
+        #[arg(long)]
+        max_minutes: Option<u64>,
+
+        /// Maximum number of epochs to run (default: run until the time budget).
+        #[arg(long)]
+        epochs: Option<u64>,
+    },
+    /// Replay the failing epoch described by `<DIR>/meta.json`.
+    Replay {
+        /// Failure directory containing the recorded artifacts.
+        dir: PathBuf,
+    },
+}
+
+/// Parses the CLI arguments and dispatches to the long test run or a failure replay.
+fn main() -> Result<()> {
+    match Cli::parse().command {
+        Commands::Test {
+            ops_per_epoch,
+            cases_per_epoch,
+            seed,
+            max_minutes,
+            epochs,
+        } => {
+            let seed = match seed {
+                Some(seed) => {
+                    let bytes = hex::decode(&seed).context("decoding hex seed")?;
+                    let array: [u8; Hash::DIGEST_SIZE] =
+                        bytes.as_slice().try_into().map_err(|_| {
+                            anyhow::anyhow!("seed must be exactly 32 bytes ({} given)", bytes.len())
+                        })?;
+                    Some(Hash::from(array))
+                }
+                None => None,
+            };
+            // Saturate: a huge `--max-minutes` must not overflow the `u64` seconds count.
+            let time_budget = max_minutes.map(|m| Duration::from_secs(m.saturating_mul(60)));
+            run_long_test(LongTestConfig {
+                ops_per_epoch,
+                cases_per_epoch,
+                seed,
+                time_budget,
+                epochs,
+            })
+        }
+        Commands::Replay { dir } => replay_failure(&dir),
+    }
+}
diff --git a/durable-storage/src/long_test/mod.rs b/durable-storage/src/long_test/mod.rs
index edb3b1ff03..729b20dd9e 100644
--- a/durable-storage/src/long_test/mod.rs
+++ b/durable-storage/src/long_test/mod.rs
@@ -21,7 +21,6 @@ pub mod strategy;
 
 use std::fs;
 use std::path::Path;
-use std::path::PathBuf;
 use std::time::Duration;
 use std::time::Instant;
 
@@ -69,8 +68,6 @@ pub struct LongTestConfig {
     pub seed: Option<Hash>,
     /// Time budget. The loop stops cleanly once exceeded.
     pub time_budget: Option<Duration>,
-    /// If set, replay the failing epoch described by `<dir>/meta.json`.
-    pub replay: Option<PathBuf>,
 }
 
 /// Metadata persisted alongside a failure which enables replaying it.
@@ -94,11 +91,6 @@ struct FailureMeta {
 
 /// Run the long-running test
 pub fn run_long_test(config: LongTestConfig) -> Result<()> {
-    // Replay reconstructs only the failing epoch; it is handled separately.
-    if let Some(replay_dir) = &config.replay {
-        return replay_failure(replay_dir);
-    }
-
     let seed = config
         .seed
         .unwrap_or_else(|| rand::random::<[u8; 32]>().into());
@@ -110,9 +102,19 @@ pub fn run_long_test(config: LongTestConfig) -> Result<()> {
         .tempdir()?
         .keep();
 
-    eprintln!("test seed: {seed}");
+    let mut rerun = format!(
+        "cargo run --release --features rocksdb,unstable-test-utils --bin database_long_test -- \
+         test --seed {seed} --ops-per-epoch {ops_per_epoch} --cases-per-epoch {cases_per_epoch}"
+    );
+    if let Some(epochs) = max_epochs {
+        rerun.push_str(&format!(" --epochs {epochs}"));
+    }
+    if let Some(budget) = config.time_budget {
+        rerun.push_str(&format!(" --max-minutes {}", budget.as_secs() / 60));
+    }
     eprintln!(
-        "out-dir: {} | ops/epoch: {ops_per_epoch} | cases/epoch: {cases_per_epoch}",
+        "test directory: {} | ops/epoch: {ops_per_epoch} | cases/epoch: {cases_per_epoch}\n\
+         rerun with:\n{rerun}",
         out_dir.display(),
     );
 
@@ -171,6 +173,19 @@ pub fn run_long_test(config: LongTestConfig) -> Result<()> {
 
         match result {
             Ok(()) => {
+                // Report sizes only in non-test builds (the binary), not in the crate's tests.
+                #[cfg(not(test))]
+                {
+                    let snapshot_dir = persistent_repo.database_commit_dir(&base.commit);
+                    let snapshot_size = dir_size(&snapshot_dir)
+                        .context("measuring the size of the latest snapshot")?;
+                    eprintln!(
+                        "epoch {epoch} ok ({} keys, latest snapshot: {:.2} MiB)",
+                        base.model.data.len(),
+                        snapshot_size as f64 / (1024.0 * 1024.0),
+                    );
+                }
+                #[cfg(test)]
                 eprintln!(
                     "epoch {epoch} ok (db contains {} entries)",
                     base.model.data.len()
@@ -208,9 +223,38 @@ pub fn run_long_test(config: LongTestConfig) -> Result<()> {
     }
 
     eprintln!("completed {epoch} epochs");
+
+    // Report sizes only in non-test builds (the binary), not in the crate's tests.
+    #[cfg(not(test))]
+    {
+        drop(runtime);
+
+        let repo_size = dir_size(&repo_dir).context("measuring the size of the repo")?;
+        eprintln!(
+            "total repo size: {:.2} MiB",
+            repo_size as f64 / (1024.0 * 1024.0)
+        );
+    }
+
     Ok(())
 }
 
+/// Sums, in bytes, the lengths of all non-directory entries found under `dir`,
+/// descending into subdirectories recursively.
+#[cfg(not(test))]
+fn dir_size(dir: &Path) -> std::io::Result<u64> {
+    fs::read_dir(dir)?.try_fold(0u64, |total, entry| -> std::io::Result<u64> {
+        let entry = entry?;
+        let meta = entry.metadata()?;
+        let entry_size = if meta.is_dir() {
+            dir_size(&entry.path())?
+        } else {
+            meta.len()
+        };
+        Ok(total + entry_size)
+    })
+}
+
 /// Build a deterministically seeded test runner for `epoch`.
 fn epoch_runner(seed: Hash, epoch: u64, cases: u32) -> TestRunner {
     // XOR the epoch index into the seed so each epoch has a distinct yet
@@ -274,9 +318,11 @@ fn write_failure(
         .context("writing the persistent base snapshot")?;
 
     eprintln!(
-        "failure artifacts written to {}; replay with --replay {}",
-        failure_dir.display(),
-        failure_dir.display(),
+        "failure artifacts written to {failure}\n\
+         replay with:\n\
+         cargo run --release \
+         --features rocksdb,unstable-test-utils --bin database_long_test -- replay {failure}",
+        failure = failure_dir.display(),
     );
     Ok(())
 }
@@ -284,7 +330,7 @@ fn write_failure(
 /// Reproduce a recorded failure by reconstructing only the failing epoch.
 /// Both the persistence backend's base and the in-memory backend's base
 /// are restored from disk, and the saved (shrunk) operation sequence is applied once.
-fn replay_failure(dir: &Path) -> Result<()> {
+pub fn replay_failure(dir: &Path) -> Result<()> {
     fn read_failure_file<T: serde::de::DeserializeOwned>(
         failure_dir: &Path,
         name: &str,
@@ -374,6 +420,8 @@ fn replay_failure(dir: &Path) -> Result<()> {
 
 #[cfg(test)]
 mod tests {
+    use std::path::PathBuf;
+
     use bytes::Bytes;
     use octez_riscv_test_utils::TestableTmpdir;
     use tokio::runtime::Runtime;
@@ -390,7 +438,6 @@ mod tests {
             cases_per_epoch: 32,
             seed: None,
             time_budget: None,
-            replay: None,
         })
         .expect("the short long test run should succeed");
     }

From 7325f47a09dbed8e5151474cd29878c0f2a47872 Mon Sep 17 00:00:00 2001
From: Victor Dumitrescu <victor.dumitrescu@nomadic-labs.com>
Date: Mon, 8 Jun 2026 15:31:01 +0200
Subject: [PATCH 2/2] feat(ci): add a PR job which runs the long tests

---
 .github/workflows/ci.yml | 3 +++
 durable-storage/Makefile | 6 +++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 73d1a80afe..6f8379a2b7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -53,6 +53,9 @@ jobs:
           - runs-on: macos-latest
             make-target: check
             title: MacOS Checks
+          - runs-on: ubuntu-latest
+            make-target: durable/long-test
+            title: Durable Storage Long Tests
 
     name: ${{ matrix.title }}
     runs-on: ${{ matrix.runs-on }}
diff --git a/durable-storage/Makefile b/durable-storage/Makefile
index ed96cce89b..3f032c984d 100644
--- a/durable-storage/Makefile
+++ b/durable-storage/Makefile
@@ -20,4 +20,8 @@ reset-regressions:
 	@cargo run -p xtask -- gen-database-regression-inputs
 	@UPDATE_GOLDENFILES=1 cargo nextest run test_database_regression
 
-.PHONY: all check test gen-regression-inputs
+long-test:
+	@cargo run --release --features rocksdb,unstable-test-utils \
+		--bin database_long_test -- test --max-minutes 10
+
+.PHONY: all check test gen-regression-inputs long-test