diff --git a/.gitignore b/.gitignore index 3b09cfb4..29f3d39d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ help.sh docker.log docs/package-lock.json +__pycache__ +*.pyc + # auto-generated script build_wrapper.sh @@ -14,6 +17,10 @@ logging/* debug debug/* +# benchmark outputs +benchmark/results +benchmark/results/* + compile_wrapper.sh *.tar.gz diff --git a/Project.toml b/Project.toml index c898d6b4..feca11f0 100644 --- a/Project.toml +++ b/Project.toml @@ -2,12 +2,8 @@ name = "cuNumeric" uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620" version = "0.1.1" -[workspace] -projects = ["test", "dev"] - [deps] CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f" -CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0" CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909" @@ -33,7 +29,6 @@ CUDAExt = "CUDA" [compat] CNPreferences = "0.1.2" CUDA = "5.9" -CUDA_SDK_jll = "13" CxxWrap = "0.17" JuliaFormatter = "2.3.0" Legate = "0.1.2" @@ -47,3 +42,6 @@ StatsBase = "0.34" cunumeric_jl_wrapper_jll = "25.10.3" cupynumeric_jll = "25.10.3" julia = "1.10" + +[workspace] +projects = ["test", "dev"] diff --git a/benchmark/Project.toml b/benchmark/Project.toml new file mode 100644 index 00000000..62eb4c27 --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,12 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620" + +[extras] +CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f" +LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9" diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..753a0416 --- /dev/null +++ b/benchmark/README.md 
@@ -0,0 +1,78 @@ +# Benchmark configuration + +Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it. + +## Running + +```bash +julia --project run.jl # runs whatever benchmarks.toml configures +``` + +`run.jl` runs each (benchmark, backend) pair in its own process via +`run_benchmark.sh`, so backends never share a GPU/runtime within a measurement. +cuNumeric always runs; extra comparison backends are toggled in `[Global]`: + +- `cuda = true` → also run under CUDA.jl (single-GPU configs only; CUDA.jl is + single-device). +- `cupynumeric = true` → also run under cupynumeric (see below). + +### Comparing against cupynumeric + +cupynumeric runs in a conda env whose major.minor matches this project's +resolved `cupynumeric_jll`. Build it once: + +```bash +./install_cupynumeric.sh # creates env cupynumeric-bench-<major.minor> +``` + +`run.jl` derives the env name automatically; override it with `CUPYNUMERIC_ENV`. + +## Layout + +```toml +[Global] +n_warmup = 5 +n_iter = 1000 +n_trial = 5 + +[[gemm]] +T = "Float32" # element type +gpus = 1 +cpus = 2 +N = 150 +M = 150 # optional, defaults to 1 +``` + +Repeat a `[[name]]` block to add independent configs. `name` must be registered in `src/benchmarks/*.jl`, and the `[[name]]` header must sit alone on its line (no trailing comment), otherwise `run.jl` silently skips the block. + +## Lists + +Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along +two axes: +- **`T` multiplies.** The whole sweep runs once per type. +- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i` + of each is paired together. + +Each zipped field must be one of: + +- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config +- a list whose length equals the sweep length + +Any other length mismatch is an error. + +```toml +[[gemm]] +T = ["Float64", "Float32"] # multiplies +gpus = [1, 2, 4] # +cpus = 2 # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600) +N = [150, 300, 600] # +M = [150, 300, 600] # +``` + +-> 2 types * 3 sweep points = **6 runs**. 
+ +### Gotcha + +When `T = ["Float32", "Float64"]` and a length-2 `N`/`M` sweep you get all **4** +combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin a type +to a specific size, use separate `[[name]]` blocks. diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml new file mode 100644 index 00000000..688ae9e2 --- /dev/null +++ b/benchmark/benchmarks.toml @@ -0,0 +1,48 @@ +[Global] +n_warmup = 5 +n_iter = 1000 +n_trial = 5 +cupynumeric = true # (needs install_cupynumeric.sh) +cuda = false # compare against CUDA.jl (single-GPU configs only) + +#################################### +# GEMM # +# Work ~ 2*N^2*M. Hold N, scale M. # +#################################### + +[[gemm]] +T = ["Float32", "Float64"] +gpus = [1, 2, 4, 8] +cpus = 2 +N = 4096 +M = [4096, 8192, 16384, 32768] + +################################# +# Gray-Scott # +# Work ~ N*M. Hold N, scale M. # +################################# + +[[grayscott_baseline]] +T = "Float32" +gpus = [1, 2, 4, 8] +cpus = 2 +N = 1024 +M = [1024, 2048, 4096, 8192] + +[[grayscott_lifetimes]] +T = "Float32" +gpus = [1, 2, 4, 8] +cpus = 2 +N = 1024 +M = [1024, 2048, 4096, 8192] + +################################# +# Monte-Carlo Integration # +# Work ~ N. Scale N linearly # +################################# + +[[montecarlo]] +T = "Float32" +gpus = [1, 2, 4, 8] +cpus = 2 +N = [1_000_000, 2_000_000, 4_000_000, 8_000_000] diff --git a/benchmark/install_cupynumeric.sh b/benchmark/install_cupynumeric.sh new file mode 100755 index 00000000..541a654c --- /dev/null +++ b/benchmark/install_cupynumeric.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Install a cupynumeric conda env matching the cupynumeric_jll our project resolves. +# The conda package and the JLL share the calendar-versioning scheme (e.g. 25.10), +# so we pin major.minor (patch ignored) and install from the legate channel. 
+# +# Usage: +# ./install_cupynumeric.sh # create a fresh env named cupynumeric-bench- +# ./install_cupynumeric.sh --name myenv # override the env name +# ./install_cupynumeric.sh --into existing # install into an existing env instead of creating one +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +ENV_NAME="" +INTO_ENV="" + +while [[ $# -gt 0 ]]; do + case $1 in + --name) + ENV_NAME=$2 + shift 2 + ;; + --into) + INTO_ENV=$2 + shift 2 + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 [--name ] [--into ]" + exit 1 + ;; + esac +done + +# Resolve the JLL version Julia actually instantiated for this project, then keep +# major.minor only — conda packages are not published per patch. +echo "Detecting cupynumeric_jll version from the benchmark project..." +VER=$(cd "$SCRIPT_DIR" && julia --project -e ' +using Pkg +for (_, info) in Pkg.dependencies() + info.name == "cupynumeric_jll" || continue + v = info.version + isnothing(v) && continue + println("$(v.major).$(v.minor)") +end' | tail -1) + +if [[ -z "$VER" ]]; then + echo "Error: could not detect cupynumeric_jll version. Has the project been instantiated?" + exit 1 +fi + +echo "cupynumeric_jll major.minor: $VER" +SPEC="cupynumeric=$VER.*" + +if [[ -n "$INTO_ENV" ]]; then + echo "Installing $SPEC into existing env '$INTO_ENV'..." + conda install -y -n "$INTO_ENV" -c conda-forge -c legate "$SPEC" + echo "Done. Activate with: conda activate $INTO_ENV" + exit 0 +fi + +[[ -z "$ENV_NAME" ]] && ENV_NAME="cupynumeric-bench-$VER" + +if conda env list | awk '{print $1}' | grep -qx "$ENV_NAME"; then + echo "Env '$ENV_NAME' already exists with $SPEC; nothing to do." + echo "Activate with: conda activate $ENV_NAME" + exit 0 +fi + +echo "Creating env '$ENV_NAME' with $SPEC..." +conda create -y -n "$ENV_NAME" -c conda-forge -c legate "$SPEC" + +echo "Done. 
Activate with: conda activate $ENV_NAME" diff --git a/benchmark/run.jl b/benchmark/run.jl new file mode 100644 index 00000000..7d6c3bb4 --- /dev/null +++ b/benchmark/run.jl @@ -0,0 +1,103 @@ +# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and +# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before +# launching the worker (single.jl) that actually runs the benchmark. +# no args -> one command per benchmarks.toml entry +# with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> + +# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config, +# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels. + +using Pkg + +include("src/core.jl") +include("src/parse_benchmarks.jl") + +const RUNNER = joinpath(@__DIR__, "run_benchmark.sh") +const WORKER = joinpath(@__DIR__, "src/single.jl") +const PY_WORKER = joinpath(@__DIR__, "src_py/single.py") + +banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128) + +# `_lifetimes` is a cuNumeric-only code-path variant (@analyze_lifetimes) +cunumeric_only(name) = endswith(name, "_lifetimes") + +# ensure things are resolved and develop'd properly +function ensure_project_ready() + Pkg.develop(; path=joinpath(@__DIR__, "..")) + Pkg.instantiate() +end + +# default env name mirrors install_cupynumeric.sh: cupynumeric-bench-<major.minor>. +# CUPYNUMERIC_ENV overrides it. 
+function cupynumeric_env_name() + haskey(ENV, "CUPYNUMERIC_ENV") && return ENV["CUPYNUMERIC_ENV"] + for (_, info) in Pkg.dependencies() + info.name == "cupynumeric_jll" || continue + info.version === nothing && continue + return "cupynumeric-bench-$(info.version.major).$(info.version.minor)" + end + error("could not resolve cupynumeric_jll version; set CUPYNUMERIC_ENV explicitly") +end + +function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial, + cupynumeric=false, cudajl=false) + banner( + "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " * + "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)", + ) + + # each backend runs in its own worker process + args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial` + cmds = [`bash $RUNNER $WORKER $args cunumeric`] + # CUDA.jl is single-GPU only + if cudajl && gpus == 1 && !cunumeric_only(name) + push!(cmds, `bash $RUNNER $WORKER $args cudajl`) + end + if cupynumeric && !cunumeric_only(name) + push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`) + end + + for cmd in cmds + try + run(cmd) + catch e + @error "Benchmark '$(name)' failed; continuing." 
exception = e + end + end +end + +function run_all_benchmarks(config="benchmarks.toml") + gs, specs = parse_config(joinpath(@__DIR__, config)) + for spec in specs + N, M = spec.args + dispatch(; + gpus=spec.gpus, + cpus=spec.cpus, + name=spec.name, + T=spec.T, + N=N, M=M, + n_iter=gs.n_iter, + n_warmup=gs.n_warmup, + n_trial=gs.n_trial, + cupynumeric=gs.cupynumeric, + cudajl=gs.cuda, + ) + end +end + +ensure_project_ready() +if isempty(ARGS) + run_all_benchmarks() +else # dispatch on args + dispatch(; + gpus=parse(Int, ARGS[1]), + cpus=parse(Int, ARGS[2]), + name=ARGS[3], + T=ARGS[4], + N=parse(Int, ARGS[5]), + M=parse(Int, ARGS[6]), + n_iter=parse(Int, ARGS[7]), + n_warmup=parse(Int, ARGS[8]), + n_trial=parse(Int, ARGS[9]), + ) +end diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh index 07a97a05..b802f7bc 100755 --- a/benchmark/run_benchmark.sh +++ b/benchmark/run_benchmark.sh @@ -11,6 +11,7 @@ shift GPUS=0 CPUS=1 +PYENV="" while [[ $# -gt 0 ]]; do case $1 in @@ -22,6 +23,10 @@ while [[ $# -gt 0 ]]; do CPUS=$2 shift 2 ;; + --pyenv) + PYENV=$2 + shift 2 + ;; *) # Collect all other arguments as extra arguments EXTRA_ARGS+=("$1") @@ -43,17 +48,29 @@ if [[ $GPUS -lt 0 ]]; then fi if [[ $CPUS -lt 0 ]]; then - echo "CPUs ivnalid, using cpus = 1" + echo "CPUs invalid, using cpus = 1" exit fi -export LEGATE_AUTO_CONFIG=0 -export LEGATE_CONFIG="--cpus=1 --gpus=$GPUS --omps=$CPUS --ompthreads=3 --utility=2 --sysmem=256 --numamem=19029 --fbmem=7569 --zcmem=128 --regmem=0" +export LEGATE_AUTO_CONFIG=1 +export LEGATE_CONFIG="--cpus=$CPUS --gpus=$GPUS" export LEGATE_SHOW_CONFIG=1 +export LD_LIBRARY_PATH="" + echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs" -CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}" +# Python (cupynumeric) workers run in the conda env built by install_cupynumeric.sh; +# Julia (cuNumeric) workers run against the local project. 
+if [[ $FILENAME == *.py ]]; then + if [[ -z $PYENV ]]; then + echo "Error: running a .py worker requires --pyenv (run install_cupynumeric.sh first)." + exit 1 + fi + CMD="conda run --no-capture-output -n $PYENV python $FILENAME $GPUS ${EXTRA_ARGS[@]}" +else + CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}" +fi printf "Running: %s\n" "$CMD" eval "$CMD" diff --git a/benchmark/sgemm.jl b/benchmark/sgemm.jl deleted file mode 100644 index 28d9ad7c..00000000 --- a/benchmark/sgemm.jl +++ /dev/null @@ -1,56 +0,0 @@ -using cuNumeric -using LinearAlgebra -using Printf - -function initialize_cunumeric(N, M) - A = cuNumeric.as_type(cuNumeric.rand(NDArray, N, M), Float32) - B = cuNumeric.as_type(cuNumeric.rand(NDArray, M, N), Float32) - C = cuNumeric.zeros(Float32, N, N) - GC.gc() # remove the intermediate FP64 arrays - return A, B, C -end - -function total_flops(N, M) - return N * N * ((2*M) - 1) -end - -function total_space(N, M) - return 2 * (N*M) * sizeof(Float32) + (N*N) * sizeof(Float32) -end - -function gemm_cunumeric(N, M, n_samples, n_warmup) - A, B, C = initialize_cunumeric(N, M) - - start_time = nothing - for idx in range(1, n_samples + n_warmup) - if idx == n_warmup + 1 - start_time = get_time_microseconds() - end - - mul!(C, A, B) - end - total_time_μs = get_time_microseconds() - start_time - mean_time_ms = total_time_μs / (n_samples * 1e3) - gflops = total_flops(N, M) / (mean_time_ms * 1e6) # GFLOP is 1e9 - - return mean_time_ms, gflops -end - -gpus = parse(Int, ARGS[1]) -N = parse(Int, ARGS[2]) -M = parse(Int, ARGS[3]) -n_samples = parse(Int, ARGS[4]) -n_warmup = parse(Int, ARGS[5]) - -println( - "[cuNumeric] MATMUL benchmark on $(N)x$(M) matricies for $(n_samples) iterations, $(n_warmup) warmups" -) - -mean_time_ms, gflops = gemm_cunumeric(N, M, n_samples, n_warmup) - -println("[cuNumeric] Mean Run Time: $(mean_time_ms) ms") -println("[cuNumeric] FLOPS: $(gflops) GFLOPS") - -open("./gemm.csv", "a") do io - @printf(io, "%s,%d,%d,%d,%.6f,%.6f\n", 
"cunumeric", gpus, N, M, mean_time_ms, gflops) -end diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl new file mode 100644 index 00000000..a4356792 --- /dev/null +++ b/benchmark/src/benchmarks/gemm.jl @@ -0,0 +1,27 @@ +Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T} + N::Int + M::Int +end + +name(::GEMM) = "gemm" +dims(g::GEMM) = (g.N, g.M) +data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)" + +function allowed_types(::Type{GEMM}) + Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES} +end + +total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1) +total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T)) + +function initialize(s::GEMM{T}; mod=cuNumeric) where {T} + A = mod.rand(T, s.N, s.M) + B = mod.rand(T, s.M, s.N) + C = mod.zeros(T, s.N, s.N) + GC.gc() + return C, A, B +end + +run!(::GEMM, C, A, B) = mul!(C, A, B) + +register_benchmark("gemm", GEMM) diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl new file mode 100644 index 00000000..a2d51315 --- /dev/null +++ b/benchmark/src/benchmarks/grayscott.jl @@ -0,0 +1,126 @@ +struct GSParams{T} + dx::T + dt::T + c_u::T + c_v::T + f::T + k::T +end + +function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T} + GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k)) +end + +abstract type AbstractGrayScott{T} <: AbstractBenchmark{T} end + +Base.@kwdef struct GrayScottBaseline{T} <: AbstractGrayScott{T} + N::Int + M::Int +end + +Base.@kwdef struct GrayScottLifetimes{T} <: AbstractGrayScott{T} + N::Int + M::Int +end + +name(::AbstractGrayScott) = "grayscott" +dims(b::AbstractGrayScott) = (b.N, b.M) +data(b::AbstractGrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)" +allowed_types(::Type{AbstractGrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES +total_flops(b::AbstractGrayScott) = b.N * b.M # grid points updated per step + +function build_benchmark(::Type{A}, 
::Type{T}, N, M) where {A<:AbstractGrayScott,T} + A{T}(; N=N, M=M) +end + +mutable struct GrayScottState{A,P} + u::A + v::A + u_new::A + v_new::A + params::P +end + +function initialize(b::AbstractGrayScott{T}; mod=cuNumeric) where {T} + d = (b.N, b.M) + u = mod.ones(T, d) + v = mod.zeros(T, d) + u_new = mod.zeros(T, d) + v_new = mod.zeros(T, d) + + seed = min(150, b.N, b.M) + u[1:seed, 1:seed] = mod.rand(T, (seed, seed)) + v[1:seed, 1:seed] = mod.rand(T, (seed, seed)) + + return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),) +end + +# VARIANT DESCRIPTION +# baseline: as written +# lifetimes: step wrapped in @analyze_lifetimes +let body = quote + # currently we don't have NDArray^x working yet. + F_u = ( + ( + -u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)]) + ) + F_v = ( + ( + u[2:(end - 1), 2:(end - 1)] .* + (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)]) + ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)] + ) + # 2-D Laplacian via slicing, excluding boundaries + u_lap = ( + ( + u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] + + u[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + v_lap = ( + ( + v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[1:(end - 2), 2:(end - 1)] + ) ./ args.dx^2 + + ( + v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] + + v[2:(end - 1), 1:(end - 2)] + ) ./ args.dx^2 + ) + + # Forward-Euler step for all interior points + u_new[2:(end - 1), 2:(end - 1)] = + ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)] + v_new[2:(end - 1), 2:(end - 1)] = + ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)] + + # Periodic boundary conditions + u_new[:, 1] = u[:, end - 1] + u_new[:, end] = u[:, 2] + u_new[1, :] = u[end - 1, :] + u_new[end, :] = u[2, :] + v_new[:, 1] = 
v[:, end - 1] + v_new[:, end] = v[:, 2] + v_new[1, :] = v[end - 1, :] + v_new[end, :] = v[2, :] + end + @eval _gs_step!(b::GrayScottBaseline, u, v, u_new, v_new, args::GSParams) = $body + @eval _gs_step!(b::GrayScottLifetimes, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body +end + +function run!(b::AbstractGrayScott, st::GrayScottState) + _gs_step!(b, st.u, st.v, st.u_new, st.v_new, st.params) + # swap references rather than copy + st.u, st.u_new = st.u_new, st.u + st.v, st.v_new = st.v_new, st.v + return nothing +end + +register_benchmark("grayscott_baseline", GrayScottBaseline) +register_benchmark("grayscott_lifetimes", GrayScottLifetimes) diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl new file mode 100644 index 00000000..1df91c97 --- /dev/null +++ b/benchmark/src/benchmarks/montecarlo.jl @@ -0,0 +1,31 @@ +Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T} + n_samples::Int +end + +name(::MonteCarloIntegration) = "montecarlo" +dims(mci::MonteCarloIntegration) = (mci.n_samples, 1) +function data(mci::MonteCarloIntegration{T}) where {T} + "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)" +end + +allowed_types(::Type{MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES + +total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T) +total_flops(s::MonteCarloIntegration) = s.n_samples + +function initialize(mci::MonteCarloIntegration{T}; mod=cuNumeric) where {T} + # Uniform samples over the integration domain [0, 10]. + x = T(10) .* mod.rand(T, mci.n_samples) + GC.gc() + return (x,) +end + +_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples +run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2)) + +# n_samples comes in as N; M is unused. 
+function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T} + MonteCarloIntegration{T}(; n_samples=N) +end + +register_benchmark("montecarlo", MonteCarloIntegration) diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl new file mode 100644 index 00000000..db452c6e --- /dev/null +++ b/benchmark/src/core.jl @@ -0,0 +1,130 @@ +using Printf +using Statistics + +""" +- `n_warmup::Int` : Number of warmup steps. These are not timed. Intended + to avoid pre-compilation cost being timed. +- `n_iter::Int` : Number of iterations to run per trial. Should be large enough + to build up queue depth of tasks such that latency is hidden. +- `n_trial::Int` : Number of independent trials to run. Timing is restarted and + state is re-initialized for each trial. Sets number of datapoints used to estimate + standard deviations/errors. +- `n_gpu::Int` : The number of GPUs used by legate. Set through the LEGATE_CONFIG, + this value is just bookkeeping. +""" +Base.@kwdef struct GlobalSettings + n_warmup::Int # Number of warmup steps, where timing is not done. + n_iter::Int # Number of iterations to run per trial + n_trial::Int = 1 # Number of independent trials to run. + n_gpu::Int = 0 + cupynumeric::Bool = false # also run baselines under cupynumeric for comparison + cuda::Bool = false # also run under CUDA.jl for comparison (single-GPU only) +end + +######################################### + +abstract type AbstractBenchmark{T} end + +# Interface each benchmark implements (see benchmarks/gemm.jl for a template). +function name end +function dims end +function data end +function allowed_types end +function total_flops end +function initialize end +function run! end + +# Maps a benchmarks.toml table name to its benchmark type. Each benchmark file +# registers itself via `register_benchmark`. 
+const BENCHMARKS = Dict{String,Type}() +function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBenchmark} + BENCHMARKS[key] = B +end + +function build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T} + B{T}(; N=N, M=M) +end + +######################################### + +# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean +# over `n_iter` iterations for trial `i`; the spread across trials gives stddev. +struct BenchmarkResult{B<:AbstractBenchmark} + times_ms::Vector{Float64} + gflops::Vector{Float64} + benchmark::B +end + +# One timed trial: warmup, then time `n_iter` iterations of `run!`. +function _trial(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric) + GC.gc(true) + state = initialize(b; mod=mod) + + start_time = nothing + for idx in 1:(gs.n_warmup + gs.n_iter) + if idx == gs.n_warmup + 1 + start_time = get_time_microseconds() + end + run!(b, state...) + end + total_time_μs = get_time_microseconds() - start_time + + mean_time_ms = total_time_μs / (gs.n_iter * 1e3) + gflops = total_flops(b) / (mean_time_ms * 1e6) + return mean_time_ms, gflops +end + +# Run `n_trial` independent trials and collect their per-trial measurements. +function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric) + times_ms = Float64[] + gflops = Float64[] + for _ in 1:gs.n_trial + t, g = _trial(b, gs; mod=mod) + push!(times_ms, t) + push!(gflops, g) + end + return BenchmarkResult(times_ms, gflops, b) +end + +_std(x) = length(x) > 1 ? 
std(x) : 0.0 + +function save_result(br::BenchmarkResult, gpus; mod::String="cunumeric") + N, M = dims(br.benchmark) + path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark))_$(mod).csv") + mkpath(dirname(path)) + open(path, "a") do io + for trial in eachindex(br.times_ms) + @printf( + io, "%s,%d,%d,%d,%d,%.6f,%.6f\n", + mod, gpus, N, M, trial, + br.times_ms[trial], br.gflops[trial], + ) + end + end +end + +######################################### + +# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime +# preference); code-path variants leave it a no-op. +# struct Variant +# name::String +# setup::Function +# end + +# const VARIANTS = Dict{String,Variant}() + +# function register_variant(name, setup=() -> nothing) +# VARIANTS[name] = Variant(name, setup) +# end + +# function variant_setup(name) +# if haskey(VARIANTS, name) +# return VARIANTS[name].setup +# end +# return () -> nothing +# end + +# register_variant("baseline") +# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!) +# register_variant("fusion_on", cuNumeric.CNPreferences.enable_broadcast_fusion!) diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl new file mode 100644 index 00000000..605c5002 --- /dev/null +++ b/benchmark/src/parse_benchmarks.jl @@ -0,0 +1,86 @@ +using TOML + +""" +One benchmark invocation parsed from `benchmarks.toml`. `name` selects the +benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32"); +`args` are the sizes (currently `N M`). +""" +struct BenchmarkSpec + name::String + T::String + gpus::Int + cpus::Int + args::Vector{Int} +end + +# A field may be a scalar or a list. +aslist(x) = x isa AbstractVector ? collect(x) : [x] + +# Value of a zipped field for sweep position `i`. length==1 field broadcasts. +sweep_value(field, i) = length(field) == 1 ? field[1] : field[i] + +# Number of positions in the sweep. 
Every multi-element field must agree on length; +# length==1 fields broadcast and don't constrain it. +function sweep_length(name, fields) + lengths = [length(field) for (_, field) in fields if length(field) > 1] + isempty(lengths) && return 1 + allequal(lengths) || error( + "benchmark '$(name)': zipped fields gpus/cpus/N/M must share one length " * + "or be scalar; got " * join(("$k=$(length(v))" for (k, v) in fields), ", "), + ) + return first(lengths) +end + +# Names of the `[[name]]` blocks in the order they appear in the file. TOML.jl +# parses into an unordered Dict, so we scan the source to preserve run order. +function declared_order(path) + order = String[] + for line in eachline(path) + header = strip(line) + startswith(header, "[[") && endswith(header, "]]") || continue + name = strip(header[3:(end - 2)]) + name in order || push!(order, name) # if not in list, push to ordered list + end + return order +end + +function parse_config(path) + raw = TOML.parsefile(path) + + g = raw["Global"] + global_settings = GlobalSettings(; + n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1), + cupynumeric=get(g, "cupynumeric", false), + cuda=get(g, "cuda", false), + ) + + specs = BenchmarkSpec[] + for name in declared_order(path) + entries = raw[name] + for e in entries + types = aslist(get(e, "T", "Float32")) + gpus = aslist(e["gpus"]) + cpus = aslist(e["cpus"]) + # fusion = get(e, "fusion", true) + N = aslist(e["N"]) + M = aslist(get(e, "M", 1)) + + n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M]) + + for T in types, i in 1:n + push!( + specs, + BenchmarkSpec( + name, + T, + sweep_value(gpus, i), + sweep_value(cpus, i), + [sweep_value(N, i), sweep_value(M, i)], + ), + ) + end + end + end + + return global_settings, specs +end diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl new file mode 100644 index 00000000..5b2fff54 --- /dev/null +++ b/benchmark/src/single.jl @@ -0,0 +1,54 @@ +# single.jl: worker that 
runs exactly one benchmark under one backend. Launched by +# run_benchmark.sh (dispatched from run.jl), which sets LEGATE_CONFIG before julia starts. +# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> <backend> +# backend is "cunumeric" or "cudajl"; run.jl launches one worker per backend. + +using cuNumeric +using CUDACore +using LinearAlgebra + +include("core.jl") +const BENCHMARK_DIR = joinpath(@__DIR__, "benchmarks") +include.(filter(contains(r".jl$"), readdir(BENCHMARK_DIR; join=true))) + +# Resolve a TOML type string like "Float32" to the actual Julia type. +parse_type(s) = getfield(Base, Symbol(s))::DataType + +# mod runs the kernels; label tags stdout; save_as names the results CSV. +const BACKENDS = Dict( + "cunumeric" => (mod=cuNumeric, label="cuNumeric", save_as="cunumeric"), + "cudajl" => (mod=CUDACore, label="CUDA.jl", save_as="CUDA.jl"), +) + +function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, backend) + haskey(BENCHMARKS, name) || error( + "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))" + ) + haskey(BACKENDS, backend) || error( + "Unknown backend '$(backend)'. 
Known: $(join(sort(collect(keys(BACKENDS))), ", "))" + ) + bk = BACKENDS[backend] + T = parse_type(T_str) + b = build_benchmark(BENCHMARKS[name], T, N, M) + gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial) + + println( + "[$(bk.label)] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " * + "iterations ($(n_warmup) warmup) x $(n_trial) trials", + ) + br = run_benchmark(b, gs; mod=bk.mod) + @printf("[%s] Mean Run Time: %.5f ± %.5f ms\n", bk.label, mean(br.times_ms), _std(br.times_ms)) + @printf("[%s] FLOPS: %.5f ± %.5f GFLOPS\n", bk.label, mean(br.gflops), _std(br.gflops)) + save_result(br, gpus; mod=bk.save_as) +end + +gpus = parse(Int, ARGS[1]) +bench_name = ARGS[2] +T_str = ARGS[3] +N = parse(Int, ARGS[4]) +M = parse(Int, ARGS[5]) +n_iter = parse(Int, ARGS[6]) +n_warmup = parse(Int, ARGS[7]) +n_trial = parse(Int, ARGS[8]) +backend = ARGS[9] +run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, backend) diff --git a/benchmark/src_py/benchmarks/__init__.py b/benchmark/src_py/benchmarks/__init__.py new file mode 100644 index 00000000..2eee4477 --- /dev/null +++ b/benchmark/src_py/benchmarks/__init__.py @@ -0,0 +1,8 @@ +import importlib +import pkgutil + +from core import BENCHMARKS + +# Import each module so it self-registers into BENCHMARKS. 
+for _info in pkgutil.iter_modules(__path__): + importlib.import_module(f"{__name__}.{_info.name}") diff --git a/benchmark/src_py/benchmarks/gemm.py b/benchmark/src_py/benchmarks/gemm.py new file mode 100644 index 00000000..b5d1a4b3 --- /dev/null +++ b/benchmark/src_py/benchmarks/gemm.py @@ -0,0 +1,29 @@ +import cupynumeric as np + +from core import register_benchmark + + +class GEMM: + name = "gemm" + + def __init__(self, T, N, M): + self.T, self.N, self.M = T, N, M + + def dims(self): + return self.N, self.M + + def total_flops(self): + return self.N * self.N * (2 * self.M - 1) + + def initialize(self): + A = np.random.rand(self.N, self.M).astype(self.T) + B = np.random.rand(self.M, self.N).astype(self.T) + C = np.zeros((self.N, self.N), dtype=self.T) + return (C, A, B) + + def run(self, state): + C, A, B = state + np.matmul(A, B, out=C) + + +register_benchmark("gemm", GEMM) diff --git a/benchmark/src_py/benchmarks/grayscott.py b/benchmark/src_py/benchmarks/grayscott.py new file mode 100644 index 00000000..a1a89e73 --- /dev/null +++ b/benchmark/src_py/benchmarks/grayscott.py @@ -0,0 +1,71 @@ +import cupynumeric as np + +from core import register_benchmark + + +class GrayScott: + name = "grayscott" + + # dt = dx/5; c_u, c_v, f, k as in grayscott.jl's GSParams defaults. 
+ def __init__(self, T, N, M, dx=1.0, c_u=1.0, c_v=0.3, f=0.03, k=0.06): + self.T, self.N, self.M = T, N, M + self.dx = T(dx) + self.dt = T(dx / 5) + self.c_u, self.c_v, self.f, self.k = T(c_u), T(c_v), T(f), T(k) + + def dims(self): + return self.N, self.M + + def total_flops(self): + return self.N * self.M + + def initialize(self): + d = (self.N, self.M) + u = np.ones(d, dtype=self.T) + v = np.zeros(d, dtype=self.T) + u_new = np.zeros(d, dtype=self.T) + v_new = np.zeros(d, dtype=self.T) + + seed = min(150, self.N, self.M) + u[:seed, :seed] = np.random.rand(seed, seed).astype(self.T) + v[:seed, :seed] = np.random.rand(seed, seed).astype(self.T) + # mutable list so run() can swap buffers in place + return [u, v, u_new, v_new] + + def run(self, state): + u, v, u_new, v_new = state + ui = u[1:-1, 1:-1] + vi = v[1:-1, 1:-1] + + F_u = (-ui * (vi * vi)) + self.f * (1 - ui) + F_v = (ui * (vi * vi)) - (self.f + self.k) * vi + + dx2 = self.dx * self.dx + u_lap = ( + (u[2:, 1:-1] - 2 * ui + u[:-2, 1:-1]) / dx2 + + (u[1:-1, 2:] - 2 * ui + u[1:-1, :-2]) / dx2 + ) + v_lap = ( + (v[2:, 1:-1] - 2 * vi + v[:-2, 1:-1]) / dx2 + + (v[1:-1, 2:] - 2 * vi + v[1:-1, :-2]) / dx2 + ) + + u_new[1:-1, 1:-1] = (self.c_u * u_lap + F_u) * self.dt + ui + v_new[1:-1, 1:-1] = (self.c_v * v_lap + F_v) * self.dt + vi + + # periodic boundary conditions + u_new[:, 0] = u[:, -2] + u_new[:, -1] = u[:, 1] + u_new[0, :] = u[-2, :] + u_new[-1, :] = u[1, :] + v_new[:, 0] = v[:, -2] + v_new[:, -1] = v[:, 1] + v_new[0, :] = v[-2, :] + v_new[-1, :] = v[1, :] + + # swap references rather than copy + state[0], state[2] = u_new, u + state[1], state[3] = v_new, v + + +register_benchmark("grayscott_baseline", GrayScott) diff --git a/benchmark/src_py/benchmarks/montecarlo.py b/benchmark/src_py/benchmarks/montecarlo.py new file mode 100644 index 00000000..370fc7b9 --- /dev/null +++ b/benchmark/src_py/benchmarks/montecarlo.py @@ -0,0 +1,28 @@ +import cupynumeric as np + +from core import register_benchmark + + +class 
class MonteCarlo:
    """Monte Carlo integration benchmark.

    Estimates the integral of ``exp(-x**2)`` from samples ``x`` obtained by
    scaling ``np.random.rand`` output by 10, using ``(10 / n) * sum(exp(-x*x))``.
    """

    name = "montecarlo"

    def __init__(self, T, N, M):
        """Remember the dtype ``T`` and the sample count ``N``.

        ``M`` is accepted so the constructor has the same ``(T, N, M)``
        signature as the other benchmarks; it is not used.
        """
        self.T, self.n_samples = T, N

    def dims(self):
        """Return the problem size as ``(n_samples, 1)``."""
        return (self.n_samples, 1)

    def total_flops(self):
        """Return the work estimate: one unit per sample."""
        return self.n_samples

    def initialize(self):
        """Draw the sample array and return it as a 1-tuple state."""
        scaled = self.T(10) * np.random.rand(self.n_samples)
        return (scaled.astype(self.T),)

    def run(self, state):
        """Return the integral estimate for the samples held in ``state``."""
        [samples] = state
        weight = self.T(10) / self.n_samples
        integrand = np.exp(-(samples * samples))
        return weight * np.sum(integrand)
Known: {', '.join(DTYPES)}") + return DTYPES[s] + + +BENCHMARKS = {} + + +def register_benchmark(key, cls): + BENCHMARKS[key] = cls + + +def trial(bench, n_warmup, n_iter): + state = bench.initialize() + start = None + for idx in range(n_warmup + n_iter): + if idx == n_warmup: + start = time() + bench.run(state) + total_us = time() - start + + mean_time_ms = total_us / (n_iter * 1e3) + gflops = bench.total_flops() / (mean_time_ms * 1e6) + return mean_time_ms, gflops + + +def _mean(x): + return sum(x) / len(x) + + +def _std(x): + if len(x) < 2: + return 0.0 + m = _mean(x) + return math.sqrt(sum((v - m) ** 2 for v in x) / (len(x) - 1)) + + +def save_result(name, dims, gpus, times_ms, gflops): + os.makedirs(RESULTS_DIR, exist_ok=True) + N, M = dims + path = os.path.join(RESULTS_DIR, f"{name}_{MOD}.csv") + with open(path, "a") as io: + for i, (t, g) in enumerate(zip(times_ms, gflops), start=1): + io.write(f"{MOD},{gpus},{N},{M},{i},{t:.6f},{g:.6f}\n") diff --git a/benchmark/src_py/single.py b/benchmark/src_py/single.py new file mode 100644 index 00000000..005cda31 --- /dev/null +++ b/benchmark/src_py/single.py @@ -0,0 +1,48 @@ +# cupynumeric worker, run by run_benchmark.sh which sets LEGATE_CONFIG first. +# Args: +import os +import sys + +# Make `core` and the `benchmarks` package importable when run as a script. +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from core import MOD, parse_type, trial, save_result, _mean, _std +from benchmarks import BENCHMARKS # import populates BENCHMARKS + + +def main(): + gpus = int(sys.argv[1]) + name = sys.argv[2] + T_str = sys.argv[3] + N = int(sys.argv[4]) + M = int(sys.argv[5]) + n_iter = int(sys.argv[6]) + n_warmup = int(sys.argv[7]) + n_trial = int(sys.argv[8]) + + if name not in BENCHMARKS: + raise ValueError( + f"No benchmark registered for '{name}'. 
def main():
    """Run one benchmark configuration taken from the command line.

    Positional arguments (``sys.argv[1:9]``), in order:
    ``gpus name type N M n_iter n_warmup n_trial``. ``name`` must be a key of
    ``BENCHMARKS`` and ``type`` a name accepted by ``parse_type``; the rest
    are integers. Any further arguments are ignored.

    Runs ``n_trial`` trials, prints the mean and standard deviation of the
    run time and GFLOPS, and appends the per-trial values via ``save_result``.

    Raises:
        SystemExit: with a usage message if fewer than eight arguments are given.
        ValueError: for a non-integer count, an unknown benchmark or type,
            or an iteration/trial count outside its valid range.
    """
    arg_names = ("gpus", "name", "type", "N", "M", "n_iter", "n_warmup", "n_trial")
    if len(sys.argv) < len(arg_names) + 1:
        prog = sys.argv[0] if sys.argv else "single.py"
        usage = " ".join(f"<{arg}>" for arg in arg_names)
        raise SystemExit(f"usage: {prog} {usage}")

    gpus = int(sys.argv[1])
    name = sys.argv[2]
    T_str = sys.argv[3]
    N, M, n_iter, n_warmup, n_trial = (int(arg) for arg in sys.argv[4:9])

    # Reject counts that would otherwise fail late, after setup, with an
    # unrelated error (no timed iteration, or a mean over zero trials).
    if n_iter < 1 or n_warmup < 0 or n_trial < 1:
        raise ValueError(
            "n_iter and n_trial must be >= 1 and n_warmup >= 0, got "
            f"n_iter={n_iter}, n_warmup={n_warmup}, n_trial={n_trial}"
        )

    if name not in BENCHMARKS:
        raise ValueError(
            f"No benchmark registered for '{name}'. Known: {', '.join(sorted(BENCHMARKS))}"
        )
    T = parse_type(T_str)
    bench = BENCHMARKS[name](T, N, M)

    print(
        f"[{MOD}] {name} benchmark ({T_str}) on {N}x{M} for {n_iter} "
        f"iterations ({n_warmup} warmup) x {n_trial} trials"
    )

    times_ms, gflops = [], []
    for _ in range(n_trial):
        t, g = trial(bench, n_warmup, n_iter)
        times_ms.append(t)
        gflops.append(g)

    print(f"[{MOD}] Mean Run Time: {_mean(times_ms):.5f} ± {_std(times_ms):.5f} ms")
    print(f"[{MOD}] FLOPS: {_mean(gflops):.5f} ± {_std(gflops):.5f} GFLOPS")

    # Results are filed under the benchmark's own ``name`` attribute, which
    # need not equal the registry key given on the command line.
    save_result(bench.name, bench.dims(), gpus, times_ms, gflops)


if __name__ == "__main__":
    main()