diff --git a/Cargo.lock b/Cargo.lock index dd674229d4..d69f95ed83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" + [[package]] name = "anes" version = "0.1.6" @@ -669,6 +675,7 @@ checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -691,6 +698,17 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" @@ -732,10 +750,13 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", + "futures-io", "futures-macro", "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", "slab", @@ -821,6 +842,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "indenter" version = "0.3.3" @@ -1139,12 +1169,20 @@ dependencies = [ [[package]] name = "miden-gpu" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04742d5184bd76e7b92afcd9ca36b7c587a555d5fc4b27bb8f9d64ac57151f9c" dependencies = [ + "allocator-api2", + "cc", "metal", + "miden-crypto", "once_cell", + "rayon", + "sppark", + "thiserror 2.0.11", + "tracing", + "winter-air", "winter-math", + "winter-prover", + "winterfell", ] [[package]] @@ -1226,6 +1264,7 @@ dependencies = [ "miden-gpu", "miden-processor", "pollster", + "serial_test", "tracing", "winter-maybe-async", "winter-prover", @@ -1942,6 +1981,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scc" +version = "2.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66b202022bb57c049555430e11fc22fea12909276a80a4c3d368da36ac1d88ed" +dependencies = [ + "sdd", +] + [[package]] name = "scoped-tls" version = "1.0.1" @@ -1954,6 +2002,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sdd" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49c1eeaf4b6a87c7479688c6d52b9f1153cedd3c489300564f932b065c6eab95" + [[package]] name = "semver" version = "0.9.0" @@ -2016,6 +2070,31 @@ dependencies = [ "serde", ] +[[package]] +name = "serial_test" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b258109f244e1d6891bf1053a55d63a5cd4f8f4c30cf9a1280989f80e7a1fa9" +dependencies = [ + "futures", + "log", + "once_cell", + "parking_lot", + "scc", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.10.8" @@ -2085,6 +2164,16 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "sppark" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5090642d9ae844edd9a5c23cb1fce6724ca88d50c55178ee8d1f656df5826b" +dependencies = [ + "cc", + "which", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -2618,6 +2707,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2722,6 +2823,15 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -2963,6 +3073,17 @@ dependencies = [ "winter-utils", ] +[[package]] +name = "winterfell" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6bdcd01333bbf4a349d8d13f269281524bd6d1a36ae3a853187f0665bf1cfd4" +dependencies = [ + "winter-air", + "winter-prover", + "winter-verifier", +] + [[package]] name = "yansi" version = "1.0.1" diff --git a/Makefile b/Makefile index 92636ec556..5a4865b7e2 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,8 @@ DEBUG_ASSERTIONS=RUSTFLAGS="-C debug-assertions" FEATURES_CONCURRENT_EXEC=--features concurrent,executable FEATURES_LOG_TREE=--features concurrent,executable,tracing-forest FEATURES_METAL_EXEC=--features concurrent,executable,metal -ALL_FEATURES_BUT_ASYNC=--features concurrent,executable,metal,testing,with-debug-info,internal +FEATURES_CUDA_EXEC=--features concurrent,executable,cuda +ALL_FEATURES_BUT_ASYNC=--features concurrent,executable,metal,cuda,testing,with-debug-info,internal # -- linting -------------------------------------------------------------------------------------- @@ -100,6 +101,10 @@ exec-single: ## Builds a single-threaded executable exec-metal: ## Builds an executable with Metal acceleration enabled cargo build --profile optimized $(FEATURES_METAL_EXEC) +.PHONY: exec-cuda +exec-cuda: ## Builds an executable with CUDA acceleration enabled + RUSTFLAGS="-C target-feature=+avx2" cargo build --profile optimized $(FEATURES_CUDA_EXEC) + .PHONY: exec-avx2 exec-avx2: ## Builds an executable with AVX2 acceleration enabled RUSTFLAGS="-C target-feature=+avx2" cargo build --profile optimized $(FEATURES_CONCURRENT_EXEC) diff --git a/air/src/options.rs b/air/src/options.rs index bd1a5e6e56..e38f7a0bc9 100644 --- a/air/src/options.rs +++ b/air/src/options.rs @@ -129,6 +129,23 @@ impl ProvingOptions { self } + /// Sets partitions for this [ProvingOptions]. + /// + /// Partitions can be provided to split traces during proving and distribute work across + /// multiple devices. The number of partitions should be equal to the number of devices. + pub const fn with_partitions(mut self, num_partitions: usize) -> Self { + // All currently supported hash functions consume 8 felts per iteration. + // Match statement ensures that future changes to available hashes are reflected here. + let hash_rate = match self.hash_fn { + HashFunction::Blake3_192 => 8, + HashFunction::Blake3_256 => 8, + HashFunction::Rpo256 => 8, + HashFunction::Rpx256 => 8, + }; + self.proof_options = self.proof_options.with_partitions(num_partitions, hash_rate); + self + } + // PUBLIC ACCESSORS // -------------------------------------------------------------------------------------------- diff --git a/docs/src/intro/usage.md b/docs/src/intro/usage.md index 27cdb7973f..8b20770cf1 100644 --- a/docs/src/intro/usage.md +++ b/docs/src/intro/usage.md @@ -38,12 +38,20 @@ Internally, Miden VM uses [rayon](https://github.com/rayon-rs/rayon) for paralle ### GPU acceleration -Miden VM proof generation can be accelerated via GPUs. Currently, GPU acceleration is enabled only on Apple Silicon hardware (via [Metal]()). To compile Miden VM with Metal acceleration enabled, you can run the following command: +Miden VM proof generation can be accelerated via GPUs. Currently, GPU acceleration is enabled on Apple Silicon hardware (via [Metal]()) and Nvidia GPUs (via [Cuda](https://en.wikipedia.org/wiki/CUDA)). + +To compile Miden VM with Metal acceleration enabled, you can run the following command: ```shell make exec-metal ``` +To compile Miden VM with CUDA acceleration enabled, you can run the following command: + +```shell +make exec-cuda +``` + Similar to `make exec` command, this will place the resulting `miden` executable into the `./target/optimized` directory. Currently, GPU acceleration is applicable only to recursive proofs which can be generated using the `-r` flag. @@ -66,7 +74,7 @@ make exec-sve This will place the resulting `miden` executable into the `./target/optimized` directory. -Similar to Metal acceleration, SVE/AVX2 acceleration is currently applicable only to recursive proofs which can be generated using the `-r` flag. +Similar to GPU acceleration, SVE/AVX2 acceleration is currently applicable only to recursive proofs which can be generated using the `-r` flag. ### Running Miden VM diff --git a/miden/Cargo.toml b/miden/Cargo.toml index f4f9031f3c..22d4b8483a 100644 --- a/miden/Cargo.toml +++ b/miden/Cargo.toml @@ -55,6 +55,7 @@ executable = [ "dep:rustyline", "dep:tracing-subscriber", ] +cuda = ["prover/cuda", "std"] metal = ["prover/metal", "std"] std = ["assembly/std", "processor/std", "prover/std", "verifier/std"] # For internal use, not meant to be used by users diff --git a/miden/README.md b/miden/README.md index 710e1305dd..4ddffbe679 100644 --- a/miden/README.md +++ b/miden/README.md @@ -250,6 +250,9 @@ make exec # build an executable for Apple silicon (concurrent+metal) make exec-metal +# build an executable for Nvidia GPU (concurrent+cuda) +make exec-cuda + # build an executable for targets with AVX2 instructions (concurrent) make exec-avx2 diff --git a/miden/src/cli/prove.rs b/miden/src/cli/prove.rs index 1a0d5a59ea..51786f4b02 100644 --- a/miden/src/cli/prove.rs +++ b/miden/src/cli/prove.rs @@ -4,6 +4,9 @@ use assembly::diagnostics::{IntoDiagnostic, Report, WrapErr}; use clap::Parser; use miden_vm::{internal::InputFile, ProvingOptions}; use processor::{DefaultHost, ExecutionOptions, ExecutionOptionsError, Program}; +#[cfg(all(target_arch = "x86_64", feature = "cuda"))] +use prover::cuda::get_num_of_gpus; +use prover::Prover; use stdlib::StdLibrary; use tracing::instrument; @@ -65,6 +68,11 @@ impl ProveCmd { pub fn get_proof_options(&self) -> Result { let exec_options = ExecutionOptions::new(Some(self.max_cycles), self.expected_cycles, self.trace, false)?; + + let partitions = 1; + #[cfg(all(target_arch = "x86_64", feature = "cuda"))] + let partitions = get_num_of_gpus(); + Ok(match self.security.as_str() { "96bits" => { if self.rpx { @@ -82,7 +90,8 @@ impl ProveCmd { }, other => panic!("{} is not a valid security setting", other), } - .with_execution_options(exec_options)) + .with_execution_options(exec_options) + .with_partitions(partitions)) } pub fn execute(&self) -> Result<(), Report> { @@ -105,10 +114,11 @@ impl ProveCmd { self.get_proof_options().map_err(|err| Report::msg(format!("{err}")))?; // execute program and generate proof - let (stack_outputs, proof) = - prover::prove(&program, stack_inputs, &mut host, proving_options) - .into_diagnostic() - .wrap_err("Failed to prove program")?; + let mut prover = Prover::new(); + let (stack_outputs, proof) = prover + .prove(&program, stack_inputs, &mut host, proving_options) + .into_diagnostic() + .wrap_err("Failed to prove program")?; println!( "Program with hash {} proved in {} ms", diff --git a/miden/src/lib.rs b/miden/src/lib.rs index 61e85e5002..c8ce95d84d 100644 --- a/miden/src/lib.rs +++ b/miden/src/lib.rs @@ -15,7 +15,7 @@ pub use processor::{ ProgramInfo, StackInputs, VmState, VmStateIterator, ZERO, }; pub use prover::{ - math, prove, Digest, ExecutionProof, FieldExtension, HashFunction, InputError, Proof, + math, Digest, ExecutionProof, FieldExtension, HashFunction, InputError, Proof, Prover, ProvingOptions, StackOutputs, Word, }; pub use verifier::{verify, VerificationError}; diff --git a/processor/src/errors.rs b/processor/src/errors.rs index 4851ff83d1..78a7006a9d 100644 --- a/processor/src/errors.rs +++ b/processor/src/errors.rs @@ -92,6 +92,8 @@ pub enum ExecutionError { NoMastForestWithProcedure { root_digest: Digest }, #[error("memory address cannot exceed 2^32 but was {0}")] MemoryAddressOutOfBounds(u64), + #[error("VM exceeded the memory usage limit {0}")] + MemoryLimitExceeded(usize), #[error( "word memory access at address {addr} in context {ctx} is unaligned at clock cycle {clk}" )] diff --git a/prover/Cargo.toml b/prover/Cargo.toml index bce4cfe7d3..7f34d2c01a 100644 --- a/prover/Cargo.toml +++ b/prover/Cargo.toml @@ -17,7 +17,8 @@ edition.workspace = true async = ["winter-maybe-async/async"] concurrent = ["processor/concurrent", "std", "winter-prover/concurrent"] default = ["std"] -metal = ["dep:miden-gpu", "dep:elsa", "dep:pollster", "concurrent", "std"] +metal = ["miden-gpu/metal", "dep:elsa", "dep:pollster", "concurrent"] +cuda = ["miden-gpu/cuda", "concurrent"] std = ["air/std", "processor/std", "winter-prover/std"] [dependencies] @@ -29,5 +30,11 @@ winter-prover = { package = "winter-prover", version = "0.11", default-features [target.'cfg(all(target_arch = "aarch64", target_os = "macos"))'.dependencies] elsa = { version = "1.9", optional = true } -miden-gpu = { version = "0.4", optional = true } +miden-gpu = { path = "../../miden-gpu", optional = true } pollster = { version = "0.4", optional = true } + +[target.'cfg(target_arch = "x86_64")'.dependencies] +miden-gpu = { path = "../../miden-gpu", optional = true } + +[dev-dependencies] +serial_test = "3.1" diff --git a/prover/README.md b/prover/README.md index b1465b21ac..8675d0df9f 100644 --- a/prover/README.md +++ b/prover/README.md @@ -18,7 +18,7 @@ If the program is executed successfully, the function returns a tuple with 2 ele Here is a simple example of executing a program which pushes two numbers onto the stack and computes their sum: ```Rust use miden_assembly::Assembler; -use miden_prover::{prove, ProvingOptions, StackInputs, DefaultHost}; +use miden_prover::{Prover, ProvingOptions, StackInputs, DefaultHost}; // instantiate the assembler let assembler = Assembler::default(); @@ -27,7 +27,8 @@ let assembler = Assembler::default(); let program = assembler.compile("begin push.3 push.5 add end").unwrap(); // let's execute it and generate a STARK proof -let (outputs, proof) = prove( +let mut prover = Prover::new(); +let (outputs, proof) = prover.prove( &program, StackInputs::default(), // we won't provide any stack inputs &mut DefaultHost::default(), // we'll be using a default host @@ -45,6 +46,7 @@ Miden prover can be compiled with the following features: * `std` - enabled by default and relies on the Rust standard library. * `concurrent` - implies `std` and also enables multi-threaded proof generation. * `metal` - enables [Metal](https://en.wikipedia.org/wiki/Metal_(API))-based acceleration of proof generation (for recursive proofs) on supported platforms (e.g., Apple silicon). +* `cuda` - enables [Cuda](https://en.wikipedia.org/wiki/CUDA)-based acceleration of proof generation (for recursive proofs) on supported platforms. * `no_std` does not rely on the Rust standard library and enables compilation to WebAssembly. * Only the `wasm32-unknown-unknown` and `wasm32-wasip1` targets are officially supported. diff --git a/prover/src/gpu/cuda/mod.rs b/prover/src/gpu/cuda/mod.rs new file mode 100644 index 0000000000..6fa1738078 --- /dev/null +++ b/prover/src/gpu/cuda/mod.rs @@ -0,0 +1,158 @@ +//! This module contains GPU acceleration logic for Nvidia CUDA devices. + +use std::{cell::RefCell, marker::PhantomData}; + +use air::{AuxRandElements, PartitionOptions}; +use miden_gpu::{ + cuda::{constraints::CudaConstraintCommitment, merkle::MerkleTree, trace_lde::CudaTraceLde}, + HashFn, +}; +use processor::crypto::{ElementHasher, Hasher}; +use winter_prover::{ + crypto::Digest, matrix::ColMatrix, CompositionPoly, CompositionPolyTrace, + ConstraintCompositionCoefficients, DefaultConstraintEvaluator, Prover, StarkDomain, TraceInfo, + TracePolyTable, +}; + +use crate::{ + crypto::{RandomCoin, Rpo256}, + ExecutionProver, ExecutionTrace, Felt, FieldElement, ProcessorAir, PublicInputs, + WinterProofOptions, +}; + +#[cfg(test)] +mod tests; + +// CONSTANTS +// ================================================================================================ + +const DIGEST_SIZE: usize = Rpo256::DIGEST_RANGE.end - Rpo256::DIGEST_RANGE.start; + +// CUDA RPO/RPX PROVER +// ================================================================================================ + +/// Wraps an [ExecutionProver] and provides GPU acceleration for building trace commitments. +pub(crate) struct CudaExecutionProver<'g, H, D, R> +where + H: Hasher + ElementHasher, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, + R: RandomCoin + Send, +{ + main: RefCell<&'g mut [Felt]>, + aux: RefCell<&'g mut [Felt]>, + ce: RefCell<&'g mut [Felt]>, + + pub execution_prover: ExecutionProver, + pub hash_fn: HashFn, + phantom_data: PhantomData, +} + +impl<'g, H, D, R> CudaExecutionProver<'g, H, D, R> +where + H: Hasher + ElementHasher, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, + R: RandomCoin + Send, +{ + pub fn new( + execution_prover: ExecutionProver, + hash_fn: HashFn, + main: &'g mut [Felt], + aux: &'g mut [Felt], + ce: &'g mut [Felt], + ) -> Self { + CudaExecutionProver { + main: RefCell::new(main), + aux: RefCell::new(aux), + ce: RefCell::new(ce), + execution_prover, + hash_fn, + phantom_data: PhantomData, + } + } +} + +impl<'g, H, D, R> Prover for CudaExecutionProver<'g, H, D, R> +where + H: Hasher + ElementHasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, + R: RandomCoin + Send, +{ + type BaseField = Felt; + type Air = ProcessorAir; + type Trace = ExecutionTrace; + type VC = MerkleTree<'g, Self::HashFn>; + type HashFn = H; + type RandomCoin = R; + type TraceLde> = CudaTraceLde<'g, E, H>; + type ConstraintCommitment> = + CudaConstraintCommitment<'g, E, H>; + type ConstraintEvaluator<'a, E: FieldElement> = + DefaultConstraintEvaluator<'a, ProcessorAir, E>; + + fn get_pub_inputs(&self, trace: &ExecutionTrace) -> PublicInputs { + self.execution_prover.get_pub_inputs(trace) + } + + fn options(&self) -> &WinterProofOptions { + self.execution_prover.options() + } + + fn new_trace_lde>( + &self, + trace_info: &TraceInfo, + main_trace: &ColMatrix, + domain: &StarkDomain, + partition_options: PartitionOptions, + ) -> (Self::TraceLde, TracePolyTable) { + CudaTraceLde::new( + self.main.take(), + self.aux.take(), + trace_info, + main_trace, + domain, + partition_options, + self.hash_fn, + ) + } + + fn new_evaluator<'a, E: FieldElement>( + &self, + air: &'a ProcessorAir, + aux_rand_elements: Option>, + composition_coefficients: ConstraintCompositionCoefficients, + ) -> Self::ConstraintEvaluator<'a, E> { + self.execution_prover + .new_evaluator(air, aux_rand_elements, composition_coefficients) + } + + fn build_aux_trace( + &self, + main_trace: &Self::Trace, + aux_rand_elements: &AuxRandElements, + ) -> ColMatrix + where + E: FieldElement, + { + self.execution_prover.build_aux_trace(main_trace, aux_rand_elements) + } + + fn build_constraint_commitment( + &self, + composition_poly_trace: CompositionPolyTrace, + num_constraint_composition_columns: usize, + domain: &StarkDomain, + partition_options: PartitionOptions, + ) -> (Self::ConstraintCommitment, CompositionPoly) + where + E: FieldElement, + { + CudaConstraintCommitment::new( + self.ce.take(), + composition_poly_trace, + num_constraint_composition_columns, + domain, + partition_options, + self.hash_fn, + ) + } +} diff --git a/prover/src/gpu/cuda/tests.rs b/prover/src/gpu/cuda/tests.rs new file mode 100644 index 0000000000..7b327df287 --- /dev/null +++ b/prover/src/gpu/cuda/tests.rs @@ -0,0 +1,289 @@ +use alloc::vec::Vec; +use std::vec; + +use air::{ProvingOptions, StarkField}; +use miden_gpu::{cuda::alloc::alloc_pinned, HashFn, DIGEST_SIZE, RATE}; +use processor::{ + crypto::{Hasher, RpoDigest, RpoRandomCoin, Rpx256, RpxDigest, RpxRandomCoin}, + math::fft, + StackInputs, StackOutputs, +}; +use serial_test::serial; +use winter_prover::{crypto::Digest, CompositionPolyTrace, ConstraintCommitment, TraceLde}; + +use crate::{gpu::cuda::CudaExecutionProver, *}; + +fn build_trace_commitment_on_gpu_with_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let num_rows = 1 << 8; + let trace_info = get_trace_info(1, num_rows); + let trace = gen_random_trace(num_rows, RATE + 1); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + let partition_options = PartitionOptions::new(1, 8); + + let mut main = alloc_pinned(102400, Felt::ZERO); + let mut aux = alloc_pinned(102400, Felt::ZERO); + let mut ce = alloc_pinned(102400, Felt::ZERO); + let gpu_prover = CudaExecutionProver::new( + create_test_prover::(is_rpx), + hash_fn, + &mut main, + &mut aux, + &mut ce, + ); + let cpu_prover = create_test_prover::(is_rpx); + + let (cpu_trace_lde, cpu_polys) = + cpu_prover.new_trace_lde::(&trace_info, &trace, &domain, partition_options); + let (gpu_trace_lde, gpu_polys) = + gpu_prover.new_trace_lde::(&trace_info, &trace, &domain, partition_options); + + assert_eq!( + cpu_trace_lde.get_main_trace_commitment(), + gpu_trace_lde.get_main_trace_commitment() + ); + assert_eq!( + cpu_polys.main_trace_polys().collect::>(), + gpu_polys.main_trace_polys().collect::>() + ); +} + +fn build_trace_commitment_on_gpu_without_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let num_rows = 1 << 8; + let trace_info = get_trace_info(8, num_rows); + let trace = gen_random_trace(num_rows, RATE); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + let partition_options = PartitionOptions::new(1, 8); + + let mut main = alloc_pinned(102400, Felt::ZERO); + let mut aux = alloc_pinned(102400, Felt::ZERO); + let mut ce = alloc_pinned(102400, Felt::ZERO); + let gpu_prover = CudaExecutionProver::new( + create_test_prover::(is_rpx), + hash_fn, + &mut main, + &mut aux, + &mut ce, + ); + let cpu_prover = create_test_prover::(is_rpx); + + let (cpu_trace_lde, cpu_polys) = + cpu_prover.new_trace_lde::(&trace_info, &trace, &domain, partition_options); + let (gpu_trace_lde, gpu_polys) = + gpu_prover.new_trace_lde::(&trace_info, &trace, &domain, partition_options); + + assert_eq!( + cpu_trace_lde.get_main_trace_commitment(), + gpu_trace_lde.get_main_trace_commitment() + ); + assert_eq!( + cpu_polys.main_trace_polys().collect::>(), + gpu_polys.main_trace_polys().collect::>() + ); +} + +fn build_constraint_commitment_on_gpu_with_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let num_rows = 1 << 8; + let ce_blowup_factor = 2; + let values = get_random_values::(num_rows * ce_blowup_factor); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + let partition_options = PartitionOptions::new(1, 8); + + let mut main = alloc_pinned(102400, Felt::ZERO); + let mut aux = alloc_pinned(102400, Felt::ZERO); + let mut ce = alloc_pinned(102400, Felt::ZERO); + let gpu_prover = CudaExecutionProver::new( + create_test_prover::(is_rpx), + hash_fn, + &mut main, + &mut aux, + &mut ce, + ); + let cpu_prover = create_test_prover::(is_rpx); + + let (commitment_cpu, composition_poly_cpu) = cpu_prover.build_constraint_commitment( + CompositionPolyTrace::new(values.clone()), + 2, + &domain, + partition_options, + ); + let (commitment_gpu, composition_poly_gpu) = gpu_prover.build_constraint_commitment( + CompositionPolyTrace::new(values), + 2, + &domain, + partition_options, + ); + + assert_eq!(commitment_cpu.commitment(), commitment_gpu.commitment()); + assert_ne!(0, composition_poly_cpu.data().num_base_cols() % RATE); + assert_eq!(composition_poly_cpu.into_columns(), composition_poly_gpu.into_columns()); +} + +fn build_constraint_commitment_on_gpu_without_padding_matches_cpu< + R: RandomCoin + Send, + H: ElementHasher + Hasher + Send + Sync, + D: Digest + From<[Felt; DIGEST_SIZE]> + Into<[Felt; DIGEST_SIZE]>, +>( + hash_fn: HashFn, +) { + let is_rpx = matches!(hash_fn, HashFn::Rpx256); + + let num_rows = 1 << 8; + let ce_blowup_factor = 8; + let values = get_random_values::(num_rows * ce_blowup_factor); + let domain = StarkDomain::from_twiddles(fft::get_twiddles(num_rows), 8, Felt::GENERATOR); + let partition_options = PartitionOptions::new(1, 8); + + let mut main = alloc_pinned(102400, Felt::ZERO); + let mut aux = alloc_pinned(102400, Felt::ZERO); + let mut ce = alloc_pinned(102400, Felt::ZERO); + let gpu_prover = CudaExecutionProver::new( + create_test_prover::(is_rpx), + hash_fn, + &mut main, + &mut aux, + &mut ce, + ); + let cpu_prover = create_test_prover::(is_rpx); + + let (commitment_cpu, composition_poly_cpu) = cpu_prover.build_constraint_commitment( + CompositionPolyTrace::new(values.clone()), + 8, + &domain, + partition_options, + ); + let (commitment_gpu, composition_poly_gpu) = gpu_prover.build_constraint_commitment( + CompositionPolyTrace::new(values), + 8, + &domain, + partition_options, + ); + + assert_eq!(commitment_cpu.commitment(), commitment_gpu.commitment()); + assert_eq!(0, composition_poly_cpu.data().num_base_cols() % RATE); + assert_eq!(composition_poly_cpu.into_columns(), composition_poly_gpu.into_columns()); +} + +#[test] +#[serial] +fn rpo_build_trace_commitment_on_gpu_with_padding_matches_cpu() { + build_trace_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpo256, + ); +} + +#[test] +#[serial] +fn rpx_build_trace_commitment_on_gpu_with_padding_matches_cpu() { + build_trace_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpx256, + ); +} + +#[test] +#[serial] +fn rpo_build_trace_commitment_on_gpu_without_padding_matches_cpu() { + build_trace_commitment_on_gpu_without_padding_matches_cpu::( + HashFn::Rpo256, + ); +} + +#[test] +#[serial] +fn rpx_build_trace_commitment_on_gpu_without_padding_matches_cpu() { + build_trace_commitment_on_gpu_without_padding_matches_cpu::( + HashFn::Rpx256, + ); +} + +#[test] +#[serial] +fn rpo_build_constraint_commitment_on_gpu_with_padding_matches_cpu() { + build_constraint_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpo256, + ); +} + +#[test] +#[serial] +fn rpx_build_constraint_commitment_on_gpu_with_padding_matches_cpu() { + build_constraint_commitment_on_gpu_with_padding_matches_cpu::( + HashFn::Rpx256, + ); +} + +#[test] +#[serial] +fn rpo_build_constraint_commitment_on_gpu_without_padding_matches_cpu() { + build_constraint_commitment_on_gpu_without_padding_matches_cpu::< + RpoRandomCoin, + Rpo256, + RpoDigest, + >(HashFn::Rpo256); +} + +#[test] +#[serial] +fn rpx_build_constraint_commitment_on_gpu_without_padding_matches_cpu() { + build_constraint_commitment_on_gpu_without_padding_matches_cpu::< + RpxRandomCoin, + Rpx256, + RpxDigest, + >(HashFn::Rpx256); +} + +fn gen_random_trace(num_rows: usize, num_cols: usize) -> ColMatrix { + ColMatrix::new((0..num_cols as u64).map(|col| vec![Felt::new(col); num_rows]).collect()) +} + +fn get_random_values(num_rows: usize) -> Vec { + (0..num_rows).map(|i| E::from(i as u32)).collect() +} + +fn get_trace_info(num_cols: usize, num_rows: usize) -> TraceInfo { + TraceInfo::new(num_cols, num_rows) +} + +fn create_test_prover< + R: RandomCoin + Send, + H: ElementHasher, +>( + use_rpx: bool, +) -> ExecutionProver { + if use_rpx { + ExecutionProver::new( + ProvingOptions::with_128_bit_security_rpx(), + StackInputs::default(), + StackOutputs::default(), + ) + } else { + ExecutionProver::new( + ProvingOptions::with_128_bit_security(true), + StackInputs::default(), + StackOutputs::default(), + ) + } +} diff --git a/prover/src/gpu/mod.rs b/prover/src/gpu/mod.rs index 253db30f7c..4eba753953 100644 --- a/prover/src/gpu/mod.rs +++ b/prover/src/gpu/mod.rs @@ -1,2 +1,5 @@ #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] pub mod metal; + +#[cfg(all(feature = "cuda", target_arch = "x86_64"))] +pub mod cuda; diff --git a/prover/src/lib.rs b/prover/src/lib.rs index 25fb837dbd..ba0ded8615 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -7,9 +7,18 @@ extern crate alloc; extern crate std; use core::marker::PhantomData; +use std::env; -use air::{AuxRandElements, PartitionOptions, ProcessorAir, PublicInputs}; -#[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] +use air::{ + trace::{AUX_TRACE_WIDTH, TRACE_WIDTH}, + AuxRandElements, PartitionOptions, ProcessorAir, PublicInputs, +}; +#[cfg(all(target_arch = "x86_64", feature = "cuda"))] +use miden_gpu::cuda::util::CudaStorageOwned; +#[cfg(any( + all(feature = "metal", target_arch = "aarch64", target_os = "macos"), + all(feature = "cuda", target_arch = "x86_64") +))] use miden_gpu::HashFn; use processor::{ crypto::{ @@ -24,7 +33,8 @@ use winter_maybe_async::{maybe_async, maybe_await}; use winter_prover::{ matrix::ColMatrix, CompositionPoly, CompositionPolyTrace, ConstraintCompositionCoefficients, DefaultConstraintCommitment, DefaultConstraintEvaluator, DefaultTraceLde, - ProofOptions as WinterProofOptions, Prover, StarkDomain, TraceInfo, TracePolyTable, + ProofOptions as WinterProofOptions, Prover as WinterProver, StarkDomain, TraceInfo, + TracePolyTable, }; #[cfg(feature = "std")] use {std::time::Instant, winter_prover::Trace}; @@ -34,6 +44,10 @@ mod gpu; // ================================================================================================ pub use air::{DeserializationError, ExecutionProof, FieldExtension, HashFunction, ProvingOptions}; +#[cfg(all(target_arch = "x86_64", feature = "cuda"))] +pub mod cuda { + pub use miden_gpu::cuda::get_num_of_gpus; +} pub use processor::{ crypto, math, utils, AdviceInputs, Digest, ExecutionError, Host, InputError, MemAdviceProvider, StackInputs, StackOutputs, Word, @@ -43,85 +57,130 @@ pub use winter_prover::{crypto::MerkleTree as MerkleTreeVC, Proof}; // PROVER // ================================================================================================ -/// Executes and proves the specified `program` and returns the result together with a STARK-based -/// proof of the program's execution. -/// -/// - `stack_inputs` specifies the initial state of the stack for the VM. -/// - `host` specifies the host environment which contain non-deterministic (secret) inputs for the -/// prover -/// - `options` defines parameters for STARK proof generation. +/// Program executor and a STARK proover. /// -/// # Errors -/// Returns an error if program execution or STARK proof generation fails for any reason. -#[instrument("prove_program", skip_all)] -#[maybe_async] -pub fn prove( - program: &Program, - stack_inputs: StackInputs, - host: &mut impl Host, - options: ProvingOptions, -) -> Result<(StackOutputs, ExecutionProof), ExecutionError> { - // execute the program to create an execution trace - #[cfg(feature = "std")] - let now = Instant::now(); - let trace = - processor::execute(program, stack_inputs.clone(), host, *options.execution_options())?; - #[cfg(feature = "std")] - tracing::event!( - tracing::Level::INFO, - "Generated execution trace of {} columns and {} steps ({}% padded) in {} ms", - trace.info().main_trace_width(), - trace.trace_len_summary().padded_trace_len(), - trace.trace_len_summary().padding_percentage(), - now.elapsed().as_millis() - ); +/// It allows concrete implementations of hardware-accelerated provers to store reusable buffers +/// for better performance. +pub struct Prover { + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + storage: CudaStorageOwned, +} - let stack_outputs = trace.stack_outputs().clone(); - let hash_fn = options.hash_fn(); +impl Prover { + #[cfg(not(all(feature = "cuda", target_arch = "x86_64")))] + pub fn new() -> Self { + Self {} + } - // generate STARK proof - let proof = match hash_fn { - HashFunction::Blake3_192 => { - let prover = ExecutionProver::>::new( - options, - stack_inputs, - stack_outputs.clone(), - ); - maybe_await!(prover.prove(trace)) - }, - HashFunction::Blake3_256 => { - let prover = ExecutionProver::>::new( - options, - stack_inputs, - stack_outputs.clone(), - ); - maybe_await!(prover.prove(trace)) - }, - HashFunction::Rpo256 => { - let prover = ExecutionProver::::new( - options, - stack_inputs, - stack_outputs.clone(), - ); - #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] - let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpo256); - maybe_await!(prover.prove(trace)) - }, - HashFunction::Rpx256 => { - let prover = ExecutionProver::::new( - options, - stack_inputs, - stack_outputs.clone(), - ); - #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] - let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpx256); - maybe_await!(prover.prove(trace)) - }, + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + #[instrument("create_prover", skip_all)] + pub fn new() -> Self { + let buffer_size = env::var("MIDEN_CUDA_MEM_SIZE_MB") + .expect("Provide MIDEN_CUDA_MEM_SIZE_MB env variable to specify the max CUDA buffer size (in megabytes)") + .parse::() + .expect("MIDEN_CUDA_MEM_SIZE_MB must be a number"); + Self { + storage: CudaStorageOwned::new(1024 * 1024 * buffer_size), + } } - .map_err(ExecutionError::ProverError)?; - let proof = ExecutionProof::new(proof, hash_fn); - Ok((stack_outputs, proof)) + /// Executes and proves the specified `program` and returns the result together with a + /// STARK-based proof of the program's execution. + /// + /// - `stack_inputs` specifies the initial state of the stack for the VM. + /// - `host` specifies the host environment which contain non-deterministic (secret) inputs for + /// the prover + /// - `options` defines parameters for STARK proof generation. + /// + /// # Errors + /// Returns an error if program execution or STARK proof generation fails for any reason. + #[instrument("prove_program", skip_all)] + #[maybe_async] + pub fn prove( + &mut self, + program: &Program, + stack_inputs: StackInputs, + host: &mut impl Host, + options: ProvingOptions, + ) -> Result<(StackOutputs, ExecutionProof), ExecutionError> { + // execute the program to create an execution trace + #[cfg(feature = "std")] + let now = Instant::now(); + let trace = + processor::execute(program, stack_inputs.clone(), host, *options.execution_options())?; + #[cfg(feature = "std")] + tracing::event!( + tracing::Level::INFO, + "Generated execution trace of {} columns and {} steps ({}% padded) in {} ms", + trace.info().main_trace_width(), + trace.trace_len_summary().padded_trace_len(), + trace.trace_len_summary().padding_percentage(), + now.elapsed().as_millis() + ); + + let stack_outputs = trace.stack_outputs().clone(); + let hash_fn = options.hash_fn(); + + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + let Some((main, aux, ce)) = self.storage.borrow_mut( + TRACE_WIDTH, + AUX_TRACE_WIDTH, + trace.get_trace_len(), + options.clone().into(), + ) else { + return Err(ExecutionError::MemoryLimitExceeded(self.storage.capacity())); + }; + + // generate STARK proof + let proof = match hash_fn { + HashFunction::Blake3_192 => { + let prover = ExecutionProver::>::new( + options, + stack_inputs, + stack_outputs.clone(), + ); + maybe_await!(prover.prove(trace)) + }, + HashFunction::Blake3_256 => { + let prover = ExecutionProver::>::new( + options, + stack_inputs, + stack_outputs.clone(), + ); + maybe_await!(prover.prove(trace)) + }, + HashFunction::Rpo256 => { + let prover = ExecutionProver::::new( + options, + stack_inputs, + stack_outputs.clone(), + ); + #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] + let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpo256); + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + let prover = + gpu::cuda::CudaExecutionProver::new(prover, HashFn::Rpo256, main, aux, ce); + maybe_await!(prover.prove(trace)) + }, + HashFunction::Rpx256 => { + let prover = ExecutionProver::::new( + options, + stack_inputs, + stack_outputs.clone(), + ); + #[cfg(all(feature = "metal", target_arch = "aarch64", target_os = "macos"))] + let prover = gpu::metal::MetalExecutionProver::new(prover, HashFn::Rpx256); + #[cfg(all(feature = "cuda", target_arch = "x86_64"))] + let prover = + gpu::cuda::CudaExecutionProver::new(prover, HashFn::Rpx256, main, aux, ce); + maybe_await!(prover.prove(trace)) + }, + } + .map_err(ExecutionError::ProverError)?; + let proof = ExecutionProof::new(proof, hash_fn); + + Ok((stack_outputs, proof)) + } } // PROVER @@ -176,7 +235,7 @@ where } } -impl Prover for ExecutionProver +impl WinterProver for ExecutionProver where H: ElementHasher + Sync, R: RandomCoin + Send, diff --git a/test-utils/src/lib.rs b/test-utils/src/lib.rs index f5fd3ba85f..2d45bc287c 100644 --- a/test-utils/src/lib.rs +++ b/test-utils/src/lib.rs @@ -13,6 +13,7 @@ use alloc::{ vec::Vec, }; +use air::ExecutionProof; pub use assembly::{diagnostics::Report, LibraryPath, SourceFile, SourceManager}; use assembly::{KernelLibrary, Library}; pub use pretty_assertions::{assert_eq, assert_ne, assert_str_eq}; @@ -23,8 +24,8 @@ pub use processor::{ }; #[cfg(not(target_family = "wasm"))] use proptest::prelude::{Arbitrary, Strategy}; -use prover::utils::range; -pub use prover::{prove, MemAdviceProvider, MerkleTreeVC, ProvingOptions}; +use prover::{utils::range, Host}; +pub use prover::{MemAdviceProvider, MerkleTreeVC, Prover, ProvingOptions}; pub use test_case::test_case; pub use verifier::{verify, AcceptableOptions, VerifierError}; use vm_core::{chiplets::hasher::apply_permutation, ProgramInfo}; @@ -82,6 +83,19 @@ proc.truncate_stack.4 end "; +// HELPERS +// ================================================================================================ + +pub fn prove( + program: &Program, + stack_inputs: StackInputs, + host: &mut impl Host, + options: ProvingOptions, +) -> Result<(StackOutputs, ExecutionProof), ExecutionError> { + let mut prover = Prover::new(); + prover.prove(program, stack_inputs, host, options) +} + // TEST HANDLER // ================================================================================================ @@ -379,8 +393,7 @@ impl Test { host.load_mast_forest(library.mast_forest().clone()).unwrap(); } let (mut stack_outputs, proof) = - prover::prove(&program, stack_inputs.clone(), &mut host, ProvingOptions::default()) - .unwrap(); + prove(&program, stack_inputs.clone(), &mut host, ProvingOptions::default()).unwrap(); let program_info = ProgramInfo::from(program); if test_fail { diff --git a/verifier/src/lib.rs b/verifier/src/lib.rs index 03962b6070..7b367f844d 100644 --- a/verifier/src/lib.rs +++ b/verifier/src/lib.rs @@ -83,6 +83,12 @@ pub fn verify( let opts = AcceptableOptions::OptionSet(vec![ ProvingOptions::RECURSIVE_96_BITS, ProvingOptions::RECURSIVE_128_BITS, + ProvingOptions::RECURSIVE_96_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8), ]); verify_proof::>( proof, pub_inputs, &opts, @@ -92,6 +98,12 @@ pub fn verify( let opts = AcceptableOptions::OptionSet(vec![ ProvingOptions::RECURSIVE_96_BITS, ProvingOptions::RECURSIVE_128_BITS, + ProvingOptions::RECURSIVE_96_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(1, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(2, 8), + ProvingOptions::RECURSIVE_96_BITS.with_partitions(4, 8), + ProvingOptions::RECURSIVE_128_BITS.with_partitions(4, 8), ]); verify_proof::>( proof, pub_inputs, &opts,