diff --git a/Cargo.lock b/Cargo.lock
index dd674229d4..d69f95ed83 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -26,6 +26,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9"
+
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -669,6 +675,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
 dependencies = [
  "futures-channel",
  "futures-core",
+ "futures-executor",
  "futures-io",
  "futures-sink",
  "futures-task",
@@ -691,6 +698,17 @@ version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
 
+[[package]]
+name = "futures-executor"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
 [[package]]
 name = "futures-io"
 version = "0.3.31"
@@ -732,10 +750,13 @@ version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
 dependencies = [
+ "futures-channel",
  "futures-core",
+ "futures-io",
  "futures-macro",
  "futures-sink",
  "futures-task",
+ "memchr",
  "pin-project-lite",
  "pin-utils",
  "slab",
@@ -821,6 +842,15 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
 
+[[package]]
+name = "home"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "indenter"
 version = "0.3.3"
@@ -1139,12 +1169,20 @@ dependencies = [
 [[package]]
 name = "miden-gpu"
 version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04742d5184bd76e7b92afcd9ca36b7c587a555d5fc4b27bb8f9d64ac57151f9c"
 dependencies = [
+ "allocator-api2",
+ "cc",
  "metal",
+ "miden-crypto",
  "once_cell",
+ "rayon",
+ "sppark",
+ "thiserror 2.0.11",
+ "tracing",
+ "winter-air",
  "winter-math",
+ "winter-prover",
+ "winterfell",
 ]
 
 [[package]]
@@ -1226,6 +1264,7 @@ dependencies = [
  "miden-gpu",
  "miden-processor",
  "pollster",
+ "serial_test",
  "tracing",
  "winter-maybe-async",
  "winter-prover",
@@ -1942,6 +1981,15 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "scc"
+version = "2.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "66b202022bb57c049555430e11fc22fea12909276a80a4c3d368da36ac1d88ed"
+dependencies = [
+ "sdd",
+]
+
 [[package]]
 name = "scoped-tls"
 version = "1.0.1"
@@ -1954,6 +2002,12 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
+[[package]]
+name = "sdd"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49c1eeaf4b6a87c7479688c6d52b9f1153cedd3c489300564f932b065c6eab95"
+
 [[package]]
 name = "semver"
 version = "0.9.0"
@@ -2016,6 +2070,31 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "serial_test"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b258109f244e1d6891bf1053a55d63a5cd4f8f4c30cf9a1280989f80e7a1fa9"
+dependencies = [
+ "futures",
+ "log",
+ "once_cell",
+ "parking_lot",
+ "scc",
+ "serial_test_derive",
+]
+
+[[package]]
+name = "serial_test_derive"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "sha2"
 version = "0.10.8"
@@ -2085,6 +2164,16 @@ version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
 
+[[package]]
+name = "sppark"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac5090642d9ae844edd9a5c23cb1fce6724ca88d50c55178ee8d1f656df5826b"
+dependencies = [
+ "cc",
+ "which",
+]
+
 [[package]]
 name = "stable_deref_trait"
 version = "1.2.0"
@@ -2618,6 +2707,18 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "which"
+version = "4.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
+dependencies = [
+ "either",
+ "home",
+ "once_cell",
+ "rustix",
+]
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -2722,6 +2823,15 @@ dependencies = [
  "windows-targets 0.48.5",
 ]
 
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.59.0"
@@ -2963,6 +3073,17 @@ dependencies = [
  "winter-utils",
 ]
 
+[[package]]
+name = "winterfell"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6bdcd01333bbf4a349d8d13f269281524bd6d1a36ae3a853187f0665bf1cfd4"
+dependencies = [
+ "winter-air",
+ "winter-prover",
+ "winter-verifier",
+]
+
 [[package]]
 name = "yansi"
 version = "1.0.1"
diff --git a/Makefile b/Makefile
index 92636ec556..5a4865b7e2 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,8 @@ DEBUG_ASSERTIONS=RUSTFLAGS="-C debug-assertions"
 FEATURES_CONCURRENT_EXEC=--features concurrent,executable
 FEATURES_LOG_TREE=--features concurrent,executable,tracing-forest
 FEATURES_METAL_EXEC=--features concurrent,executable,metal
-ALL_FEATURES_BUT_ASYNC=--features concurrent,executable,metal,testing,with-debug-info,internal
+FEATURES_CUDA_EXEC=--features concurrent,executable,cuda
+ALL_FEATURES_BUT_ASYNC=--features concurrent,executable,metal,cuda,testing,with-debug-info,internal
 
 # -- linting --------------------------------------------------------------------------------------
 
@@ -100,6 +101,10 @@ exec-single: ## Builds a single-threaded executable
 exec-metal: ## Builds an executable with Metal acceleration enabled
 	cargo build --profile optimized $(FEATURES_METAL_EXEC)
 
+.PHONY: exec-cuda
+exec-cuda: ## Builds an executable with CUDA acceleration enabled
+	RUSTFLAGS="-C target-feature=+avx2" cargo build --profile optimized $(FEATURES_CUDA_EXEC)
+
 .PHONY: exec-avx2
 exec-avx2: ## Builds an executable with AVX2 acceleration enabled
 	RUSTFLAGS="-C target-feature=+avx2" cargo build --profile optimized $(FEATURES_CONCURRENT_EXEC)
diff --git a/air/src/options.rs b/air/src/options.rs
index bd1a5e6e56..e38f7a0bc9 100644
--- a/air/src/options.rs
+++ b/air/src/options.rs
@@ -129,6 +129,23 @@ impl ProvingOptions {
         self
     }
 
+    /// Sets partitions for this [ProvingOptions].
+    ///
+    /// Partitions can be provided to split traces during proving and distribute work across
+    /// multiple devices. The number of partitions should be equal to the number of devices.
+    pub const fn with_partitions(mut self, num_partitions: usize) -> Self {
+        // All currently supported hash functions consume 8 felts per iteration.
+        // Match statement ensures that future changes to available hashes are reflected here.
+        let hash_rate = match self.hash_fn {
+            HashFunction::Blake3_192 => 8,
+            HashFunction::Blake3_256 => 8,
+            HashFunction::Rpo256 => 8,
+            HashFunction::Rpx256 => 8,
+        };
+        self.proof_options = self.proof_options.with_partitions(num_partitions, hash_rate);
+        self
+    }
+
     // PUBLIC ACCESSORS
     // --------------------------------------------------------------------------------------------
 
diff --git a/docs/src/intro/usage.md b/docs/src/intro/usage.md
index 27cdb7973f..8b20770cf1 100644
--- a/docs/src/intro/usage.md
+++ b/docs/src/intro/usage.md
@@ -38,12 +38,20 @@ Internally, Miden VM uses [rayon](https://github.com/rayon-rs/rayon) for paralle
 
 ### GPU acceleration
 
-Miden VM proof generation can be accelerated via GPUs. Currently, GPU acceleration is enabled only on Apple Silicon hardware (via [Metal](<https://en.wikipedia.org/wiki/Metal_(API)>)). To compile Miden VM with Metal acceleration enabled, you can run the following command:
+Miden VM proof generation can be accelerated via GPUs. Currently, GPU acceleration is enabled on Apple Silicon hardware (via [Metal](<https://en.wikipedia.org/wiki/Metal_(API)>)) and Nvidia GPUs (via [Cuda](https://en.wikipedia.org/wiki/CUDA)).
+
+To compile Miden VM with Metal acceleration enabled, you can run the following command:
 
 ```shell
 make exec-metal
 ```
 
+To compile Miden VM with CUDA acceleration enabled, you can run the following command:
+
+```shell
+make exec-cuda
+```
+
 Similar to `make exec` command, this will place the resulting `miden` executable into the `./target/optimized` directory.
 
 Currently, GPU acceleration is applicable only to recursive proofs which can be generated using the `-r` flag.
@@ -66,7 +74,7 @@ make exec-sve
 
 This will place the resulting `miden` executable into the `./target/optimized` directory.
 
-Similar to Metal acceleration, SVE/AVX2 acceleration is currently applicable only to recursive proofs which can be generated using the `-r` flag.
+Similar to GPU acceleration, SVE/AVX2 acceleration is currently applicable only to recursive proofs which can be generated using the `-r` flag.
 
 ### Running Miden VM
 
diff --git a/miden/Cargo.toml b/miden/Cargo.toml
index f4f9031f3c..22d4b8483a 100644
--- a/miden/Cargo.toml
+++ b/miden/Cargo.toml
@@ -55,6 +55,7 @@ executable = [
     "dep:rustyline",
     "dep:tracing-subscriber",
 ]
+cuda = ["prover/cuda", "std"]
 metal = ["prover/metal", "std"]
 std = ["assembly/std", "processor/std", "prover/std", "verifier/std"]
 # For internal use, not meant to be used by users
diff --git a/miden/README.md b/miden/README.md
index 710e1305dd..4ddffbe679 100644
--- a/miden/README.md
+++ b/miden/README.md
@@ -250,6 +250,9 @@ make exec
 # build an executable for Apple silicon (concurrent+metal)
 make exec-metal
 
+# build an executable for Nvidia GPU (concurrent+cuda)
+make exec-cuda
+
 # build an executable for targets with AVX2 instructions (concurrent)
 make exec-avx2
 
diff --git a/miden/src/cli/prove.rs b/miden/src/cli/prove.rs
index 1a0d5a59ea..51786f4b02 100644
--- a/miden/src/cli/prove.rs
+++ b/miden/src/cli/prove.rs
@@ -4,6 +4,9 @@ use assembly::diagnostics::{IntoDiagnostic, Report, WrapErr};
 use clap::Parser;
 use miden_vm::{internal::InputFile, ProvingOptions};
 use processor::{DefaultHost, ExecutionOptions, ExecutionOptionsError, Program};
+#[cfg(all(target_arch = "x86_64", feature = "cuda"))]
+use prover::cuda::get_num_of_gpus;
+use prover::Prover;
 use stdlib::StdLibrary;
 use tracing::instrument;
 
@@ -65,6 +68,11 @@ impl ProveCmd {
     pub fn get_proof_options(&self) -> Result<ProvingOptions, ExecutionOptionsError> {
         let exec_options =
             ExecutionOptions::new(Some(self.max_cycles), self.expected_cycles, self.trace, false)?;
+
+        let partitions = 1;
+        #[cfg(all(target_arch = "x86_64", feature = "cuda"))]
+        let partitions = get_num_of_gpus();
+
         Ok(match self.security.as_str() {
             "96bits" => {
                 if self.rpx {
@@ -82,7 +90,8 @@ impl ProveCmd {
             },
             other => panic!("{} is not a valid security setting", other),
         }
-        .with_execution_options(exec_options))
+        .with_execution_options(exec_options)
+        .with_partitions(partitions))
     }
 
     pub fn execute(&self) -> Result<(), Report> {
@@ -105,10 +114,11 @@ impl ProveCmd {
             self.get_proof_options().map_err(|err| Report::msg(format!("{err}")))?;
 
         // execute program and generate proof
-        let (stack_outputs, proof) =
-            prover::prove(&program, stack_inputs, &mut host, proving_options)
-                .into_diagnostic()
-                .wrap_err("Failed to prove program")?;
+        let mut prover = Prover::new();
+        let (stack_outputs, proof) = prover
+            .prove(&program, stack_inputs, &mut host, proving_options)
+            .into_diagnostic()
+            .wrap_err("Failed to prove program")?;
 
         println!(
             "Program with hash {} proved in {} ms",
diff --git a/miden/src/lib.rs b/miden/src/lib.rs
index 61e85e5002..c8ce95d84d 100644
--- a/miden/src/lib.rs
+++ b/miden/src/lib.rs
@@ -15,7 +15,7 @@ pub use processor::{
     ProgramInfo, StackInputs, VmState, VmStateIterator, ZERO,
 };
 pub use prover::{
-    math, prove, Digest, ExecutionProof, FieldExtension, HashFunction, InputError, Proof,
+    math, Digest, ExecutionProof, FieldExtension, HashFunction, InputError, Proof, Prover,
     ProvingOptions, StackOutputs, Word,
 };
 pub use verifier::{verify, VerificationError};
diff --git a/processor/src/errors.rs b/processor/src/errors.rs
index 4851ff83d1..78a7006a9d 100644
--- a/processor/src/errors.rs
+++ b/processor/src/errors.rs
@@ -92,6 +92,8 @@ pub enum ExecutionError {
     NoMastForestWithProcedure { root_digest: Digest },
     #[error("memory address cannot exceed 2^32 but was {0}")]
     MemoryAddressOutOfBounds(u64),
+    #[error("VM exceeded the memory usage limit {0}")]
+    MemoryLimitExceeded(usize),
     #[error(
         "word memory access at address {addr} in context {ctx} is unaligned at clock cycle {clk}"
     )]
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
index bce4cfe7d3..7f34d2c01a 100644
--- a/prover/Cargo.toml
+++ b/prover/Cargo.toml
@@ -17,7 +17,8 @@ edition.workspace = true
 async = ["winter-maybe-async/async"]
 concurrent = ["processor/concurrent", "std", "winter-prover/concurrent"]
 default = ["std"]
-metal = ["dep:miden-gpu", "dep:elsa", "dep:pollster", "concurrent", "std"]
+metal = ["miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent"]
+cuda = ["miden-gpu/cuda", "concurrent"]
 std = ["air/std", "processor/std", "winter-prover/std"]
 
 [dependencies]
@@ -29,5 +30,11 @@ winter-prover = { package = "winter-prover", version = "0.11", default-features
 
 [target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies]
 elsa = { version = "1.9", optional = true }
-miden-gpu = { version = "0.4", optional = true }
+miden-gpu = { path = "../../miden-gpu", optional = true }
 pollster = { version = "0.4", optional = true }
+
+[target.'cfg(target_arch = "x86_64")'.dependencies]
+miden-gpu = { path = "../../miden-gpu", optional = true }
+
+[dev-dependencies]
+serial_test = "3.1"
diff --git a/prover/README.md b/prover/README.md
index b1465b21ac..8675d0df9f 100644
--- a/prover/README.md
+++ b/prover/README.md
@@ -18,7 +18,7 @@ If the program is executed successfully, the function returns a tuple with 2 ele
 Here is a simple example of executing a program which pushes two numbers onto the stack and computes their sum:
 ```Rust
 use miden_assembly::Assembler;
-use miden_prover::{prove, ProvingOptions, StackInputs, DefaultHost};
+use miden_prover::{Prover, ProvingOptions, StackInputs, DefaultHost};
 
 // instantiate the assembler
 let assembler = Assembler::default();
@@ -27,7 +27,8 @@ let assembler = Assembler::default();
 let program = assembler.compile("begin push.3 push.5 add end").unwrap();
 
 // let's execute it and generate a STARK proof
-let (outputs, proof) = prove(
+let mut prover = Prover::new();
+let (outputs, proof) = prover.prove(
     &program,
     StackInputs::default(),       // we won't provide any stack inputs
     &mut DefaultHost::default(),  // we'll be using a default host
@@ -45,6 +46,7 @@ Miden prover can be compiled with the following features:
 * `std` - enabled by default and relies on the Rust standard library.
 * `concurrent` - implies `std` and also enables multi-threaded proof generation.
 * `metal` - enables [Metal](https://en.wikipedia.org/wiki/Metal_(API))-based acceleration of proof generation (for recursive proofs) on supported platforms (e.g., Apple silicon).
+* `cuda` - enables [Cuda](https://en.wikipedia.org/wiki/CUDA)-based acceleration of proof generation (for recursive proofs) on supported platforms.
 * `no_std` does not rely on the Rust standard library and enables compilation to WebAssembly.
     * Only the `wasm32-unknown-unknown` and `wasm32-wasip1` targets are officially supported.
 
diff --git a/prover/src/gpu/cuda/mod.rs b/prover/src/gpu/cuda/mod.rs
new file mode 100644
index 0000000000..6fa1738078
--- /dev/null
+++ b/prover/src/gpu/cuda/mod.rs
@@ -0,0 +1,158 @@
+//! This module contains GPU acceleration logic for Nvidia CUDA devices.
+
+use std::{cell::RefCell, marker::PhantomData};
+
+use air::{AuxRandElements, PartitionOptions};
+use miden_gpu::{
+    cuda::{constraints::CudaConstraintCommitment, merkle::MerkleTree, trace_lde::CudaTraceLde},
+    HashFn,
+};
+use processor::crypto::{ElementHasher, Hasher};
+use winter_prover::{
+    crypto::Digest, matrix::ColMatrix, CompositionPoly, CompositionPolyTrace,
+    ConstraintCompositionCoefficients, DefaultConstraintEvaluator, Prover, StarkDomain, TraceInfo,
+    TracePolyTable,
+};
+
+use crate::{
+    crypto::{RandomCoin, Rpo256},
+    ExecutionProver, ExecutionTrace, Felt, FieldElement, ProcessorAir, PublicInputs,
+    WinterProofOptions,
+};
+
+#[cfg(test)]
+mod tests;
+
+// CONSTANTS
+// ================================================================================================
+
+const DIGEST_SIZE: usize = Rpo256::DIGEST_RANGE.end - Rpo256::DIGEST_RANGE.start;
+
+// CUDA RPO/RPX PROVER
+// ================================================================================================
+
+/// Wraps an [ExecutionProver] and provides GPU acceleration for building trace commitments.
+pub(crate) struct CudaExecutionProver<'g, H, D, R>
+where
+    H: Hasher<Digest = D> + ElementHasher<BaseField = R::BaseField>,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+{
+    main: RefCell<&'g mut [Felt]>,
+    aux: RefCell<&'g mut [Felt]>,
+    ce: RefCell<&'g mut [Felt]>,
+
+    pub execution_prover: ExecutionProver<H, R>,
+    pub hash_fn: HashFn,
+    phantom_data: PhantomData<D>,
+}
+
+impl<'g, H, D, R> CudaExecutionProver<'g, H, D, R>
+where
+    H: Hasher<Digest = D> + ElementHasher<BaseField = R::BaseField>,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+{
+    pub fn new(
+        execution_prover: ExecutionProver<H, R>,
+        hash_fn: HashFn,
+        main: &'g mut [Felt],
+        aux: &'g mut [Felt],
+        ce: &'g mut [Felt],
+    ) -> Self {
+        CudaExecutionProver {
+            main: RefCell::new(main),
+            aux: RefCell::new(aux),
+            ce: RefCell::new(ce),
+            execution_prover,
+            hash_fn,
+            phantom_data: PhantomData,
+        }
+    }
+}
+
+impl<'g, H, D, R> Prover for CudaExecutionProver<'g, H, D, R>
+where
+    H: Hasher<Digest = D> + ElementHasher<BaseField = R::BaseField> + Send + Sync,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+{
+    type BaseField = Felt;
+    type Air = ProcessorAir;
+    type Trace = ExecutionTrace;
+    type VC = MerkleTree<'g, Self::HashFn>;
+    type HashFn = H;
+    type RandomCoin = R;
+    type TraceLde<E: FieldElement<BaseField = Felt>> = CudaTraceLde<'g, E, H>;
+    type ConstraintCommitment<E: FieldElement<BaseField = Felt>> =
+        CudaConstraintCommitment<'g, E, H>;
+    type ConstraintEvaluator<'a, E: FieldElement<BaseField = Felt>> =
+        DefaultConstraintEvaluator<'a, ProcessorAir, E>;
+
+    fn get_pub_inputs(&self, trace: &ExecutionTrace) -> PublicInputs {
+        self.execution_prover.get_pub_inputs(trace)
+    }
+
+    fn options(&self) -> &WinterProofOptions {
+        self.execution_prover.options()
+    }
+
+    fn new_trace_lde<E: FieldElement<BaseField = Felt>>(
+        &self,
+        trace_info: &TraceInfo,
+        main_trace: &ColMatrix<Felt>,
+        domain: &StarkDomain<Felt>,
+        partition_options: PartitionOptions,
+    ) -> (Self::TraceLde<E>, TracePolyTable<E>) {
+        CudaTraceLde::new(
+            self.main.take(),
+            self.aux.take(),
+            trace_info,
+            main_trace,
+            domain,
+            partition_options,
+            self.hash_fn,
+        )
+    }
+
+    fn new_evaluator<'a, E: FieldElement<BaseField = Felt>>(
+        &self,
+        air: &'a ProcessorAir,
+        aux_rand_elements: Option<AuxRandElements<E>>,
+        composition_coefficients: ConstraintCompositionCoefficients<E>,
+    ) -> Self::ConstraintEvaluator<'a, E> {
+        self.execution_prover
+            .new_evaluator(air, aux_rand_elements, composition_coefficients)
+    }
+
+    fn build_aux_trace<E>(
+        &self,
+        main_trace: &Self::Trace,
+        aux_rand_elements: &AuxRandElements<E>,
+    ) -> ColMatrix<E>
+    where
+        E: FieldElement<BaseField = Self::BaseField>,
+    {
+        self.execution_prover.build_aux_trace(main_trace, aux_rand_elements)
+    }
+
+    fn build_constraint_commitment<E>(
+        &self,
+        composition_poly_trace: CompositionPolyTrace<E>,
+        num_constraint_composition_columns: usize,
+        domain: &StarkDomain<Self::BaseField>,
+        partition_options: PartitionOptions,
+    ) -> (Self::ConstraintCommitment<E>, CompositionPoly<E>)
+    where
+        E: FieldElement<BaseField = Self::BaseField>,
+    {
+        CudaConstraintCommitment::new(
+            self.ce.take(),
+            composition_poly_trace,
+            num_constraint_composition_columns,
+            domain,
+            partition_options,
+            self.hash_fn,
+        )
+    }
+}
diff --git a/prover/src/gpu/cuda/tests.rs b/prover/src/gpu/cuda/tests.rs
new file mode 100644
index 0000000000..7b327df287
--- /dev/null
+++ b/prover/src/gpu/cuda/tests.rs
@@ -0,0 +1,289 @@
+use alloc::vec::Vec;
+use std::vec;
+
+use air::{ProvingOptions, StarkField};
+use miden_gpu::{cuda::alloc::alloc_pinned, HashFn, DIGEST_SIZE, RATE};
+use processor::{
+    crypto::{Hasher, RpoDigest, RpoRandomCoin, Rpx256, RpxDigest, RpxRandomCoin},
+    math::fft,
+    StackInputs, StackOutputs,
+};
+use serial_test::serial;
+use winter_prover::{crypto::Digest, CompositionPolyTrace, ConstraintCommitment, TraceLde};
+
+use crate::{gpu::cuda::CudaExecutionProver, *};
+
+fn build_trace_commitment_on_gpu_with_padding_matches_cpu<
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+    H: ElementHasher<BaseField = Felt> + Hasher<Digest = D> + Send + Sync,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+>(
+    hash_fn: HashFn,
+) {
+    let is_rpx = matches!(hash_fn, HashFn::Rpx256);
+
+    let num_rows = 1 << 8;
+    let trace_info = get_trace_info(1, num_rows);
+    let trace = gen_random_trace(num_rows, RATE + 1);
+    let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR);
+    let partition_options = PartitionOptions::new(1, 8);
+
+    let mut main = alloc_pinned(102400, Felt::ZERO);
+    let mut aux = alloc_pinned(102400, Felt::ZERO);
+    let mut ce = alloc_pinned(102400, Felt::ZERO);
+    let gpu_prover = CudaExecutionProver::new(
+        create_test_prover::<R, H>(is_rpx),
+        hash_fn,
+        &mut main,
+        &mut aux,
+        &mut ce,
+    );
+    let cpu_prover = create_test_prover::<R, H>(is_rpx);
+
+    let (cpu_trace_lde, cpu_polys) =
+        cpu_prover.new_trace_lde::<Felt>(&trace_info, &trace, &domain, partition_options);
+    let (gpu_trace_lde, gpu_polys) =
+        gpu_prover.new_trace_lde::<Felt>(&trace_info, &trace, &domain, partition_options);
+
+    assert_eq!(
+        cpu_trace_lde.get_main_trace_commitment(),
+        gpu_trace_lde.get_main_trace_commitment()
+    );
+    assert_eq!(
+        cpu_polys.main_trace_polys().collect::<Vec<_>>(),
+        gpu_polys.main_trace_polys().collect::<Vec<_>>()
+    );
+}
+
+fn build_trace_commitment_on_gpu_without_padding_matches_cpu<
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+    H: ElementHasher<BaseField = Felt> + Hasher<Digest = D> + Send + Sync,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+>(
+    hash_fn: HashFn,
+) {
+    let is_rpx = matches!(hash_fn, HashFn::Rpx256);
+
+    let num_rows = 1 << 8;
+    let trace_info = get_trace_info(8, num_rows);
+    let trace = gen_random_trace(num_rows, RATE);
+    let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR);
+    let partition_options = PartitionOptions::new(1, 8);
+
+    let mut main = alloc_pinned(102400, Felt::ZERO);
+    let mut aux = alloc_pinned(102400, Felt::ZERO);
+    let mut ce = alloc_pinned(102400, Felt::ZERO);
+    let gpu_prover = CudaExecutionProver::new(
+        create_test_prover::<R, H>(is_rpx),
+        hash_fn,
+        &mut main,
+        &mut aux,
+        &mut ce,
+    );
+    let cpu_prover = create_test_prover::<R, H>(is_rpx);
+
+    let (cpu_trace_lde, cpu_polys) =
+        cpu_prover.new_trace_lde::<Felt>(&trace_info, &trace, &domain, partition_options);
+    let (gpu_trace_lde, gpu_polys) =
+        gpu_prover.new_trace_lde::<Felt>(&trace_info, &trace, &domain, partition_options);
+
+    assert_eq!(
+        cpu_trace_lde.get_main_trace_commitment(),
+        gpu_trace_lde.get_main_trace_commitment()
+    );
+    assert_eq!(
+        cpu_polys.main_trace_polys().collect::<Vec<_>>(),
+        gpu_polys.main_trace_polys().collect::<Vec<_>>()
+    );
+}
+
+fn build_constraint_commitment_on_gpu_with_padding_matches_cpu<
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+    H: ElementHasher<BaseField = Felt> + Hasher<Digest = D> + Send + Sync,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+>(
+    hash_fn: HashFn,
+) {
+    let is_rpx = matches!(hash_fn, HashFn::Rpx256);
+
+    let num_rows = 1 << 8;
+    let ce_blowup_factor = 2;
+    let values = get_random_values::<Felt>(num_rows * ce_blowup_factor);
+    let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR);
+    let partition_options = PartitionOptions::new(1, 8);
+
+    let mut main = alloc_pinned(102400, Felt::ZERO);
+    let mut aux = alloc_pinned(102400, Felt::ZERO);
+    let mut ce = alloc_pinned(102400, Felt::ZERO);
+    let gpu_prover = CudaExecutionProver::new(
+        create_test_prover::<R, H>(is_rpx),
+        hash_fn,
+        &mut main,
+        &mut aux,
+        &mut ce,
+    );
+    let cpu_prover = create_test_prover::<R, H>(is_rpx);
+
+    let (commitment_cpu, composition_poly_cpu) = cpu_prover.build_constraint_commitment(
+        CompositionPolyTrace::new(values.clone()),
+        2,
+        &domain,
+        partition_options,
+    );
+    let (commitment_gpu, composition_poly_gpu) = gpu_prover.build_constraint_commitment(
+        CompositionPolyTrace::new(values),
+        2,
+        &domain,
+        partition_options,
+    );
+
+    assert_eq!(commitment_cpu.commitment(), commitment_gpu.commitment());
+    assert_ne!(0, composition_poly_cpu.data().num_base_cols() % RATE);
+    assert_eq!(composition_poly_cpu.into_columns(), composition_poly_gpu.into_columns());
+}
+
+fn build_constraint_commitment_on_gpu_without_padding_matches_cpu<
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+    H: ElementHasher<BaseField = Felt> + Hasher<Digest = D> + Send + Sync,
+    D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>,
+>(
+    hash_fn: HashFn,
+) {
+    let is_rpx = matches!(hash_fn, HashFn::Rpx256);
+
+    let num_rows = 1 << 8;
+    let ce_blowup_factor = 8;
+    let values = get_random_values::<Felt>(num_rows * ce_blowup_factor);
+    let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR);
+    let partition_options = PartitionOptions::new(1, 8);
+
+    let mut main = alloc_pinned(102400, Felt::ZERO);
+    let mut aux = alloc_pinned(102400, Felt::ZERO);
+    let mut ce = alloc_pinned(102400, Felt::ZERO);
+    let gpu_prover = CudaExecutionProver::new(
+        create_test_prover::<R, H>(is_rpx),
+        hash_fn,
+        &mut main,
+        &mut aux,
+        &mut ce,
+    );
+    let cpu_prover = create_test_prover::<R, H>(is_rpx);
+
+    let (commitment_cpu, composition_poly_cpu) = cpu_prover.build_constraint_commitment(
+        CompositionPolyTrace::new(values.clone()),
+        8,
+        &domain,
+        partition_options,
+    );
+    let (commitment_gpu, composition_poly_gpu) = gpu_prover.build_constraint_commitment(
+        CompositionPolyTrace::new(values),
+        8,
+        &domain,
+        partition_options,
+    );
+
+    assert_eq!(commitment_cpu.commitment(), commitment_gpu.commitment());
+    assert_eq!(0, composition_poly_cpu.data().num_base_cols() % RATE);
+    assert_eq!(composition_poly_cpu.into_columns(), composition_poly_gpu.into_columns());
+}
+
+#[test]
+#[serial]
+fn rpo_build_trace_commitment_on_gpu_with_padding_matches_cpu() {
+    build_trace_commitment_on_gpu_with_padding_matches_cpu::<RpoRandomCoin, Rpo256, RpoDigest>(
+        HashFn::Rpo256,
+    );
+}
+
+#[test]
+#[serial]
+fn rpx_build_trace_commitment_on_gpu_with_padding_matches_cpu() {
+    build_trace_commitment_on_gpu_with_padding_matches_cpu::<RpxRandomCoin, Rpx256, RpxDigest>(
+        HashFn::Rpx256,
+    );
+}
+
+#[test]
+#[serial]
+fn rpo_build_trace_commitment_on_gpu_without_padding_matches_cpu() {
+    build_trace_commitment_on_gpu_without_padding_matches_cpu::<RpoRandomCoin, Rpo256, RpoDigest>(
+        HashFn::Rpo256,
+    );
+}
+
+#[test]
+#[serial]
+fn rpx_build_trace_commitment_on_gpu_without_padding_matches_cpu() {
+    build_trace_commitment_on_gpu_without_padding_matches_cpu::<RpxRandomCoin, Rpx256, RpxDigest>(
+        HashFn::Rpx256,
+    );
+}
+
+#[test]
+#[serial]
+fn rpo_build_constraint_commitment_on_gpu_with_padding_matches_cpu() {
+    build_constraint_commitment_on_gpu_with_padding_matches_cpu::<RpoRandomCoin, Rpo256, RpoDigest>(
+        HashFn::Rpo256,
+    );
+}
+
+#[test]
+#[serial]
+fn rpx_build_constraint_commitment_on_gpu_with_padding_matches_cpu() {
+    build_constraint_commitment_on_gpu_with_padding_matches_cpu::<RpxRandomCoin, Rpx256, RpxDigest>(
+        HashFn::Rpx256,
+    );
+}
+
+#[test]
+#[serial]
+fn rpo_build_constraint_commitment_on_gpu_without_padding_matches_cpu() {
+    build_constraint_commitment_on_gpu_without_padding_matches_cpu::<
+        RpoRandomCoin,
+        Rpo256,
+        RpoDigest,
+    >(HashFn::Rpo256);
+}
+
+#[test]
+#[serial]
+fn rpx_build_constraint_commitment_on_gpu_without_padding_matches_cpu() {
+    build_constraint_commitment_on_gpu_without_padding_matches_cpu::<
+        RpxRandomCoin,
+        Rpx256,
+        RpxDigest,
+    >(HashFn::Rpx256);
+}
+
+fn gen_random_trace(num_rows: usize, num_cols: usize) -> ColMatrix<Felt> {
+    ColMatrix::new((0..num_cols as u64).map(|col| vec![Felt::new(col); num_rows]).collect())
+}
+
+fn get_random_values<E: FieldElement>(num_rows: usize) -> Vec<E> {
+    (0..num_rows).map(|i| E::from(i as u32)).collect()
+}
+
+fn get_trace_info(num_cols: usize, num_rows: usize) -> TraceInfo {
+    TraceInfo::new(num_cols, num_rows)
+}
+
+fn create_test_prover<
+    R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
+    H: ElementHasher<BaseField = Felt>,
+>(
+    use_rpx: bool,
+) -> ExecutionProver<H, R> {
+    if use_rpx {
+        ExecutionProver::new(
+            ProvingOptions::with_128_bit_security_rpx(),
+            StackInputs::default(),
+            StackOutputs::default(),
+        )
+    } else {
+        ExecutionProver::new(
+            ProvingOptions::with_128_bit_security(true),
+            StackInputs::default(),
+            StackOutputs::default(),
+        )
+    }
+}
diff --git a/prover/src/gpu/mod.rs b/prover/src/gpu/mod.rs
index 253db30f7c..4eba753953 100644
--- a/prover/src/gpu/mod.rs
+++ b/prover/src/gpu/mod.rs
@@ -1,2 +1,5 @@
 #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))]
 pub mod metal;
+
+#[cfg(all(feature = "cuda", target_arch = "x86_64"))]
+pub mod cuda;
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 25fb837dbd..ba0ded8615 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -7,9 +7,18 @@ extern crate alloc;
 extern crate std;
 
 use core::marker::PhantomData;
+use std::env;
 
-use air::{AuxRandElements, PartitionOptions, ProcessorAir, PublicInputs};
-#[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))]
+use air::{
+    trace::{AUX_TRACE_WIDTH, TRACE_WIDTH},
+    AuxRandElements, PartitionOptions, ProcessorAir, PublicInputs,
+};
+#[cfg(all(target_arch = "x86_64", feature = "cuda"))]
+use miden_gpu::cuda::util::CudaStorageOwned;
+#[cfg(any(
+    all(feature = "metal", target_arch = "aarch64", target_os = "macos"),
+    all(feature = "cuda", target_arch = "x86_64")
+))]
 use miden_gpu::HashFn;
 use processor::{
     crypto::{
@@ -24,7 +33,8 @@ use winter_maybe_async::{maybe_async, maybe_await};
 use winter_prover::{
     matrix::ColMatrix, CompositionPoly, CompositionPolyTrace, ConstraintCompositionCoefficients,
     DefaultConstraintCommitment, DefaultConstraintEvaluator, DefaultTraceLde,
-    ProofOptions as WinterProofOptions, Prover, StarkDomain, TraceInfo, TracePolyTable,
+    ProofOptions as WinterProofOptions, Prover as WinterProver, StarkDomain, TraceInfo,
+    TracePolyTable,
 };
 #[cfg(feature = "std")]
 use {std::time::Instant, winter_prover::Trace};
@@ -34,6 +44,10 @@ mod gpu;
 // ================================================================================================
 
 pub use air::{DeserializationError, ExecutionProof, FieldExtension, HashFunction, ProvingOptions};
+#[cfg(all(target_arch = "x86_64", feature = "cuda"))]
+pub mod cuda {
+    pub use miden_gpu::cuda::get_num_of_gpus;
+}
 pub use processor::{
     crypto, math, utils, AdviceInputs, Digest, ExecutionError, Host, InputError, MemAdviceProvider,
     StackInputs, StackOutputs, Word,
@@ -43,85 +57,130 @@ pub use winter_prover::{crypto::MerkleTree as MerkleTreeVC, Proof};
 // PROVER
 // ================================================================================================
 
-/// Executes and proves the specified `program` and returns the result together with a STARK-based
-/// proof of the program's execution.
-///
-/// - `stack_inputs` specifies the initial state of the stack for the VM.
-/// - `host` specifies the host environment which contain non-deterministic (secret) inputs for the
-///   prover
-/// - `options` defines parameters for STARK proof generation.
+/// Program executor and a STARK proover.
 ///
-/// # Errors
-/// Returns an error if program execution or STARK proof generation fails for any reason.
-#[instrument("prove_program", skip_all)]
-#[maybe_async]
-pub fn prove(
-    program: &Program,
-    stack_inputs: StackInputs,
-    host: &mut impl Host,
-    options: ProvingOptions,
-) -> Result<(StackOutputs, ExecutionProof), ExecutionError> {
-    // execute the program to create an execution trace
-    #[cfg(feature = "std")]
-    let now = Instant::now();
-    let trace =
-        processor::execute(program, stack_inputs.clone(), host, *options.execution_options())?;
-    #[cfg(feature = "std")]
-    tracing::event!(
-        tracing::Level::INFO,
-        "Generated execution trace of {} columns and {} steps ({}% padded) in {} ms",
-        trace.info().main_trace_width(),
-        trace.trace_len_summary().padded_trace_len(),
-        trace.trace_len_summary().padding_percentage(),
-        now.elapsed().as_millis()
-    );
+/// It allows concrete implementations of hardware-accelerated provers to store reusable buffers
+/// for better performance.
+pub struct Prover {
+    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
+    storage: CudaStorageOwned,
+}
 
-    let stack_outputs = trace.stack_outputs().clone();
-    let hash_fn = options.hash_fn();
+impl Prover {
+    #[cfg(not(all(feature = "cuda", target_arch = "x86_64")))]
+    pub fn new() -> Self {
+        Self {}
+    }
 
-    // generate STARK proof
-    let proof = match hash_fn {
-        HashFunction::Blake3_192 => {
-            let prover = ExecutionProver::<Blake3_192, WinterRandomCoin<_>>::new(
-                options,
-                stack_inputs,
-                stack_outputs.clone(),
-            );
-            maybe_await!(prover.prove(trace))
-        },
-        HashFunction::Blake3_256 => {
-            let prover = ExecutionProver::<Blake3_256, WinterRandomCoin<_>>::new(
-                options,
-                stack_inputs,
-                stack_outputs.clone(),
-            );
-            maybe_await!(prover.prove(trace))
-        },
-        HashFunction::Rpo256 => {
-            let prover = ExecutionProver::<Rpo256, RpoRandomCoin>::new(
-                options,
-                stack_inputs,
-                stack_outputs.clone(),
-            );
-            #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))]
-            let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpo256);
-            maybe_await!(prover.prove(trace))
-        },
-        HashFunction::Rpx256 => {
-            let prover = ExecutionProver::<Rpx256, RpxRandomCoin>::new(
-                options,
-                stack_inputs,
-                stack_outputs.clone(),
-            );
-            #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))]
-            let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpx256);
-            maybe_await!(prover.prove(trace))
-        },
+    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
+    #[instrument("create_prover", skip_all)]
+    pub fn new() -> Self {
+        let buffer_size = env::var("MIDEN_CUDA_MEM_SIZE_MB")
+            .expect("Provide MIDEN_CUDA_MEM_SIZE_MB env variable to specify the max CUDA buffer size (in megabytes)")
+            .parse::<usize>()
+            .expect("MIDEN_CUDA_MEM_SIZE_MB must be a number");
+        Self {
+            storage: CudaStorageOwned::new(1024 * 1024 * buffer_size),
+        }
     }
-    .map_err(ExecutionError::ProverError)?;
-    let proof = ExecutionProof::new(proof, hash_fn);
 
-    Ok((stack_outputs, proof))
+    /// Executes and proves the specified `program` and returns the result together with a
+    /// STARK-based proof of the program's execution.
+    ///
+    /// - `stack_inputs` specifies the initial state of the stack for the VM.
+    /// - `host` specifies the host environment which contain non-deterministic (secret) inputs for
+    ///   the prover
+    /// - `options` defines parameters for STARK proof generation.
+    ///
+    /// # Errors
+    /// Returns an error if program execution or STARK proof generation fails for any reason.
+    #[instrument("prove_program", skip_all)]
+    #[maybe_async]
+    pub fn prove(
+        &mut self,
+        program: &Program,
+        stack_inputs: StackInputs,
+        host: &mut impl Host,
+        options: ProvingOptions,
+    ) -> Result<(StackOutputs, ExecutionProof), ExecutionError> {
+        // execute the program to create an execution trace
+        #[cfg(feature = "std")]
+        let now = Instant::now();
+        let trace =
+            processor::execute(program, stack_inputs.clone(), host, *options.execution_options())?;
+        #[cfg(feature = "std")]
+        tracing::event!(
+            tracing::Level::INFO,
+            "Generated execution trace of {} columns and {} steps ({}% padded) in {} ms",
+            trace.info().main_trace_width(),
+            trace.trace_len_summary().padded_trace_len(),
+            trace.trace_len_summary().padding_percentage(),
+            now.elapsed().as_millis()
+        );
+
+        let stack_outputs = trace.stack_outputs().clone();
+        let hash_fn = options.hash_fn();
+
+        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
+        let Some((main, aux, ce)) = self.storage.borrow_mut(
+            TRACE_WIDTH,
+            AUX_TRACE_WIDTH,
+            trace.get_trace_len(),
+            options.clone().into(),
+        ) else {
+            return Err(ExecutionError::MemoryLimitExceeded(self.storage.capacity()));
+        };
+
+        // generate STARK proof
+        let proof = match hash_fn {
+            HashFunction::Blake3_192 => {
+                let prover = ExecutionProver::<Blake3_192, WinterRandomCoin<_>>::new(
+                    options,
+                    stack_inputs,
+                    stack_outputs.clone(),
+                );
+                maybe_await!(prover.prove(trace))
+            },
+            HashFunction::Blake3_256 => {
+                let prover = ExecutionProver::<Blake3_256, WinterRandomCoin<_>>::new(
+                    options,
+                    stack_inputs,
+                    stack_outputs.clone(),
+                );
+                maybe_await!(prover.prove(trace))
+            },
+            HashFunction::Rpo256 => {
+                let prover = ExecutionProver::<Rpo256, RpoRandomCoin>::new(
+                    options,
+                    stack_inputs,
+                    stack_outputs.clone(),
+                );
+                #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))]
+                let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpo256);
+                #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
+                let prover =
+                    gpu::cuda::CudaExecutionProver::new(prover, HashFn::Rpo256, main, aux, ce);
+                maybe_await!(prover.prove(trace))
+            },
+            HashFunction::Rpx256 => {
+                let prover = ExecutionProver::<Rpx256, RpxRandomCoin>::new(
+                    options,
+                    stack_inputs,
+                    stack_outputs.clone(),
+                );
+                #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))]
+                let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpx256);
+                #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
+                let prover =
+                    gpu::cuda::CudaExecutionProver::new(prover, HashFn::Rpx256, main, aux, ce);
+                maybe_await!(prover.prove(trace))
+            },
+        }
+        .map_err(ExecutionError::ProverError)?;
+        let proof = ExecutionProof::new(proof, hash_fn);
+
+        Ok((stack_outputs, proof))
+    }
 }
 
 // PROVER
@@ -176,7 +235,7 @@ where
     }
 }
 
-impl<H, R> Prover for ExecutionProver<H, R>
+impl<H, R> WinterProver for ExecutionProver<H, R>
 where
     H: ElementHasher<BaseField = Felt> + Sync,
     R: RandomCoin<BaseField = Felt, Hasher = H> + Send,
diff --git a/test-utils/src/lib.rs b/test-utils/src/lib.rs
index f5fd3ba85f..2d45bc287c 100644
--- a/test-utils/src/lib.rs
+++ b/test-utils/src/lib.rs
@@ -13,6 +13,7 @@ use alloc::{
     vec::Vec,
 };
 
+use air::ExecutionProof;
 pub use assembly::{diagnostics::Report, LibraryPath, SourceFile, SourceManager};
 use assembly::{KernelLibrary, Library};
 pub use pretty_assertions::{assert_eq, assert_ne, assert_str_eq};
@@ -23,8 +24,8 @@ pub use processor::{
 };
 #[cfg(not(target_family = "wasm"))]
 use proptest::prelude::{Arbitrary, Strategy};
-use prover::utils::range;
-pub use prover::{prove, MemAdviceProvider, MerkleTreeVC, ProvingOptions};
+use prover::{utils::range, Host};
+pub use prover::{MemAdviceProvider, MerkleTreeVC, Prover, ProvingOptions};
 pub use test_case::test_case;
 pub use verifier::{verify, AcceptableOptions, VerifierError};
 use vm_core::{chiplets::hasher::apply_permutation, ProgramInfo};
@@ -82,6 +83,19 @@ proc.truncate_stack.4
 end
 ";
 
+// HELPERS
+// ================================================================================================
+
+pub fn prove(
+    program: &Program,
+    stack_inputs: StackInputs,
+    host: &mut impl Host,
+    options: ProvingOptions,
+) -> Result<(StackOutputs, ExecutionProof), ExecutionError> {
+    let mut prover = Prover::new();
+    prover.prove(program, stack_inputs, host, options)
+}
+
 // TEST HANDLER
 // ================================================================================================
 
@@ -379,8 +393,7 @@ impl Test {
             host.load_mast_forest(library.mast_forest().clone()).unwrap();
         }
         let (mut stack_outputs, proof) =
-            prover::prove(&program, stack_inputs.clone(), &mut host, ProvingOptions::default())
-                .unwrap();
+            prove(&program, stack_inputs.clone(), &mut host, ProvingOptions::default()).unwrap();
 
         let program_info = ProgramInfo::from(program);
         if test_fail {
diff --git a/verifier/src/lib.rs b/verifier/src/lib.rs
index 03962b6070..7b367f844d 100644
--- a/verifier/src/lib.rs
+++ b/verifier/src/lib.rs
@@ -83,6 +83,12 @@ pub fn verify(
             let opts = AcceptableOptions::OptionSet(vec![
                 ProvingOptions::RECURSIVE_96_BITS,
                 ProvingOptions::RECURSIVE_128_BITS,
+                ProvingOptions::RECURSIVE_96_BITS.with_partitions(1, 8),
+                ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8),
+                ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8),
+                ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8),
+                ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8),
+                ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8),
             ]);
             verify_proof::<ProcessorAir, Rpo256, RpoRandomCoin, MerkleTree<_>>(
                 proof, pub_inputs, &opts,
@@ -92,6 +98,12 @@ pub fn verify(
             let opts = AcceptableOptions::OptionSet(vec![
                 ProvingOptions::RECURSIVE_96_BITS,
                 ProvingOptions::RECURSIVE_128_BITS,
+                ProvingOptions::RECURSIVE_96_BITS.with_partitions(1, 8),
+                ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8),
+                ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8),
+                ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8),
+                ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8),
+                ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8),
             ]);
             verify_proof::<ProcessorAir, Rpx256, RpxRandomCoin, MerkleTree<_>>(
                 proof, pub_inputs, &opts,