JuliaLegate · ejmeitz · Jun 1, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,10 @@ logging/*
 debug
 debug/*
 
+# benchmark outputs
+benchmark/results
+benchmark/results/*
+
 compile_wrapper.sh
 
 *.tar.gz

diff --git a/Project.toml b/Project.toml
@@ -2,12 +2,8 @@ name = "cuNumeric"
 uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
 version = "0.1.1"
 
-[workspace]
-projects = ["test", "dev"]
-
 [deps]
 CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
-CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
 CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909"
@@ -33,7 +29,6 @@ CUDAExt = "CUDA"
 [compat]
 CNPreferences = "0.1.2"
 CUDA = "5.9"
-CUDA_SDK_jll = "13"
 CxxWrap = "0.17"
 JuliaFormatter = "2.3.0"
 Legate = "0.1.2"
@@ -47,3 +42,6 @@ StatsBase = "0.34"
 cunumeric_jl_wrapper_jll = "25.10.3"
 cupynumeric_jll = "25.10.3"
 julia = "1.10"
+
+[workspace]
+projects = ["test", "dev"]
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
@@ -0,0 +1,12 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
+
+[extras]
+CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
+LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9"
diff --git a/benchmark/README.md b/benchmark/README.md
@@ -0,0 +1,53 @@
+# Benchmark configuration
+
+Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.
+
+## Layout
+
+```toml
+[Global]
+n_warmup = 5
+n_iter   = 1000
+n_trial  = 5
+
+[[gemm]]            # name registered in src/benchmarks.jl
+T    = "Float32"     # element type
+gpus = 1
+cpus = 2
+N    = 150
+M    = 150           # optional, defaults to 1
+```
+
+Repeat a `[[name]]` block to add independent configs.
+
+## Lists
+
+Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
+two axes:
+
+- **`T` multiplies** — every listed type is run against every point of the sweep.
+- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
+  of each is paired together.
+
+Each zipped field must be one of:
+
+- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config
+- a list whose length equals the sweep length
+
+Any other length mismatch is an error.
+
+```toml
+[[gemm]]
+T    = ["Float64", "Float32"]   # multiplies
+gpus = [1, 2, 4]                #
+cpus = 2                        # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600)
+N    = [150, 300, 600]          #
+M    = [150, 300, 600]          #
+```
+
+-> 2 types * 3 sweep points = **6 runs**.
+
+### Gotcha
+
+When `T = ["Float32", "Float64"]` is combined with a length-2 `N`/`M` sweep, you
+get all **4** combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To pin
+a type to a specific size, use separate `[[name]]` blocks.
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
@@ -0,0 +1,46 @@
+[Global]
+n_warmup = 5
+n_iter = 1000
+n_trial = 5
+
+####################################
+#             GEMM                 #
+# Work ~ 2*N^2*M. Hold N, scale M. #
+####################################
+
+[[gemm]]
+T = ["Float32", "Float64"]
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 4096
+M = [4096, 8192, 16384, 32768]
+
+#################################
+#         Gray-Scott            #
+#  Work ~ N*M. Hold N, scale M. #
+#################################
+
+[[grayscott_baseline]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+[[grayscott_lifetimes]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+#################################
+#   Monte-Carlo Integration     #
+#  Work ~ N. Scale N linearly   #
+#################################
+
+[[montecarlo]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = [1_000_000, 2_000_000, 4_000_000, 8_000_000]
diff --git a/benchmark/run.jl b/benchmark/run.jl
@@ -0,0 +1,63 @@
+# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and
+# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
+# launching the worker (single.jl) that actually runs the benchmark.
+#   no args   -> one command per benchmarks.toml entry
+#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
+
+# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
+# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
+include("src/core.jl")
+include("src/parse_benchmarks.jl")
+
+const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
+const WORKER = joinpath(@__DIR__, "src/single.jl")
+
+# Print `msg` framed above and below by a rule of 128 '=' characters,
+# preceded by a blank line.
+function banner(msg)
+    rule = "="^128
+    println("\n", rule, "\n", msg, "\n", rule)
+end
+
+# Launch one benchmark configuration through run_benchmark.sh.
+#
+# Prints a banner describing the configuration, then runs the shell runner under
+# bash, handing it the worker script, `--gpus`/`--cpus`, and the positional
+# benchmark arguments (name, T, N, M, n_iter, n_warmup, n_trial).
+# If the command throws, the error is logged with `@error` and not rethrown,
+# so a caller looping over several configurations keeps going.
+function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)
+    # Name validity is checked in the worker (single.jl), which owns the registry.
+    headline = string(
+        "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) ",
+        "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
+    )
+    banner(headline)
+
+    launch = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
+    try
+        run(launch)
+    catch err
+        @error "Benchmark '$(name)' failed; continuing." exception = err
+    end
+end
+
+# Run every benchmark described in `config`.
+#
+# `config` is resolved relative to this file's directory and handed to
+# `parse_config`, which yields the global settings and the list of specs.
+# Each spec is dispatched in order with the same n_iter/n_warmup/n_trial.
+function run_all_benchmarks(config="benchmarks.toml")
+    settings, specs = parse_config(joinpath(@__DIR__, config))
+    foreach(specs) do spec
+        # spec.args carries the problem sizes; the first two entries are N and M.
+        n, m = spec.args
+        dispatch(;
+            gpus=spec.gpus,
+            cpus=spec.cpus,
+            name=spec.name,
+            T=spec.T,
+            N=n,
+            M=m,
+            n_iter=settings.n_iter,
+            n_warmup=settings.n_warmup,
+            n_trial=settings.n_trial,
+        )
+    end
+end
+
+# Entry point: with no arguments run everything in benchmarks.toml; otherwise
+# run the single configuration given on the command line.
+if isempty(ARGS)
+    run_all_benchmarks()
+else # dispatch on args
+    # Manual mode needs all nine positionals listed in the header comment.
+    # Report a usage error instead of dying with a BoundsError when some are missing.
+    if length(ARGS) < 9
+        println(
+            stderr,
+            "usage: julia run.jl <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>",
+        )
+        exit(1)
+    end
+    dispatch(;
+        gpus=parse(Int, ARGS[1]),
+        cpus=parse(Int, ARGS[2]),
+        name=ARGS[3],
+        T=ARGS[4],
+        N=parse(Int, ARGS[5]),
+        M=parse(Int, ARGS[6]),
+        n_iter=parse(Int, ARGS[7]),
+        n_warmup=parse(Int, ARGS[8]),
+        n_trial=parse(Int, ARGS[9]),
+    )
+end
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
@@ -43,17 +43,21 @@ if [[ $GPUS -lt 0 ]]; then
 fi
 
 if [[ $CPUS -lt 0 ]]; then
-    echo "CPUs ivnalid, using cpus = 1"
+    echo "CPUs invalid, using cpus = 1"
     exit
 fi
 
-export LEGATE_AUTO_CONFIG=0
-export LEGATE_CONFIG="--cpus=1 --gpus=$GPUS --omps=$CPUS --ompthreads=3 --utility=2 --sysmem=256 --numamem=19029 --fbmem=7569 --zcmem=128 --regmem=0"
+export LEGATE_AUTO_CONFIG=1
+export LEGATE_CONFIG="--cpus=$CPUS --gpus=$GPUS"
 export LEGATE_SHOW_CONFIG=1
 
+export LD_LIBRARY_PATH=""
+
 echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"
 
-CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+eval "julia --project -e 'using Pkg; Pkg.develop(path=\"..\"); Pkg.instantiate()'"
+
+CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}"
 
 printf "Running: %s\n" "$CMD"
 eval "$CMD"
diff --git a/benchmark/sgemm.jl b/benchmark/sgemm.jl
diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl
@@ -0,0 +1,27 @@
+# GEMM benchmark configuration, parameterised by element type `T`.
+# `N` and `M` are the matrix dimensions; `Base.@kwdef` provides the keyword
+# constructor.
+Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
+    N::Int
+    M::Int
+end
+
+# Identifier string for this benchmark.
+function name(::GEMM)
+    return "gemm"
+end
+
+# Problem dimensions as the tuple `(N, M)`.
+function dims(g::GEMM)
+    return (g.N, g.M)
+end
+
+# One-line human-readable description of the configuration.
+function data(g::GEMM{T}) where {T}
+    return "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
+end
+
+# Element types accepted for GEMM: the union of cuNumeric's supported float and
+# integer types. Dispatches on the unparameterised `GEMM` type itself.
+allowed_types(::Type{GEMM}) =
+    Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}
+
+# Operation count for the product: N*N output entries at 2M - 1 operations
+# each (M multiplies plus M - 1 adds).
+function total_flops(s::GEMM)
+    per_entry = 2 * s.M - 1
+    return s.N * s.N * per_entry
+end
+
+# Bytes occupied by the three operands: two arrays of N*M elements and one of
+# N*N elements, each element `sizeof(T)` bytes.
+function total_space(s::GEMM{T}) where {T}
+    n_elements = 2 * (s.N * s.M) + s.N * s.N
+    return n_elements * sizeof(T)
+end
+
+# Allocate the operands for one GEMM run using the array constructors of `mod`
+# (cuNumeric by default): random `A` (N x M) and `B` (M x N), and a zeroed
+# `C` (N x N). A garbage collection is forced before returning.
+# Returns `(C, A, B)`.
+function initialize(s::GEMM{T}; mod=cuNumeric) where {T}
+    n, m = s.N, s.M
+    A = mod.rand(T, n, m)
+    B = mod.rand(T, m, n)
+    C = mod.zeros(T, n, n)
+    GC.gc()
+    return (C, A, B)
+end
+
+# Benchmark body: compute the product of `A` and `B` into `C` via `mul!`.
+function run!(::GEMM, C, A, B)
+    return mul!(C, A, B)
+end
+
+# Register the benchmark type under the name "gemm".
+register_benchmark("gemm", GEMM)