Skip to content
Draft
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ logging/*
debug
debug/*

# benchmark outputs
benchmark/results
benchmark/results/*

compile_wrapper.sh

*.tar.gz
Expand Down
8 changes: 3 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@ name = "cuNumeric"
uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
version = "0.1.1"

[workspace]
projects = ["test", "dev"]

[deps]
CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909"
Expand All @@ -33,7 +29,6 @@ CUDAExt = "CUDA"
[compat]
CNPreferences = "0.1.2"
CUDA = "5.9"
CUDA_SDK_jll = "13"
CxxWrap = "0.17"
JuliaFormatter = "2.3.0"
Legate = "0.1.2"
Expand All @@ -47,3 +42,6 @@ StatsBase = "0.34"
cunumeric_jl_wrapper_jll = "25.10.3"
cupynumeric_jll = "25.10.3"
julia = "1.10"

[workspace]
projects = ["test", "dev"]
12 changes: 12 additions & 0 deletions benchmark/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"

[extras]
CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9"
53 changes: 53 additions & 0 deletions benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Benchmark configuration

Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.

## Layout

```toml
[Global]
n_warmup = 5
n_iter = 1000
n_trial = 5

[[gemm]] # name registered in src/benchmarks.jl
T = "Float32" # element type
gpus = 1
cpus = 2
N = 150
M = 150 # optional, defaults to 1
```

Repeat a `[[name]]` block to add independent configs.

## Lists

Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
two axes:

- **`T` multiplies** — every listed type is run against every point of the
sweep.
- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
of each is paired together.

Each zipped field must be one of:

- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config
- a list whose length equals the sweep length

Any other length mismatch is an error.

```toml
[[gemm]]
T = ["Float64", "Float32"] # multiplies
gpus = [1, 2, 4] #
cpus = 2 # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600)
N = [150, 300, 600] #
M = [150, 300, 600] #
```

-> 2 types * 3 sweep points = **6 runs**.

### Gotcha

When `T = ["Float32", "Float64"]` is combined with a length-2 `N`/`M` sweep, you
get all **4** combinations, not a paired `Float32 -> N[1], Float64 -> N[2]`. To
pin a type to a specific size, use separate `[[name]]` blocks.
46 changes: 46 additions & 0 deletions benchmark/benchmarks.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
[Global]
n_warmup = 5
n_iter = 1000
n_trial = 5

####################################
# GEMM #
# Work ~ 2*N^2*M. Hold N, scale M. #
####################################

[[gemm]]
T = ["Float32", "Float64"]
gpus = [1, 2, 4, 8]
cpus = 2
N = 4096
M = [4096, 8192, 16384, 32768]

#################################
# Gray-Scott #
# Work ~ N*M. Hold N, scale M. #
#################################

[[grayscott_baseline]]
T = "Float32"
gpus = [1, 2, 4, 8]
cpus = 2
N = 1024
M = [1024, 2048, 4096, 8192]

[[grayscott_lifetimes]]
T = "Float32"
gpus = [1, 2, 4, 8]
cpus = 2
N = 1024
M = [1024, 2048, 4096, 8192]

#################################
# Monte-Carlo Integration #
# Work ~ N. Scale N linearly #
#################################

[[montecarlo]]
T = "Float32"
gpus = [1, 2, 4, 8]
cpus = 2
N = [1_000_000, 2_000_000, 4_000_000, 8_000_000]
63 changes: 63 additions & 0 deletions benchmark/run.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and
# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
# launching the worker (single.jl) that actually runs the benchmark.
# no args -> one command per benchmarks.toml entry
# with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>

# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
include("src/core.jl")
include("src/parse_benchmarks.jl")

# Path to the launcher shell script, resolved relative to this file's directory.
# Per the header comment above, the script sets LEGATE_CONFIG before starting
# the worker.
const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
# Path to the worker script handed to RUNNER as its first argument; the worker
# is what actually runs a single benchmark.
const WORKER = joinpath(@__DIR__, "src/single.jl")

"""
    banner(msg)

Print `msg` to stdout framed above and below by a 128-character rule of `=`
signs, with one blank line before the top rule.
"""
function banner(msg)
    rule = repeat("=", 128)
    println("\n", rule, "\n", msg, "\n", rule)
    return nothing
end

"""
    dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)

Announce and launch a single benchmark run as an external process.

A banner summarising every keyword is printed first. The command
`bash RUNNER WORKER --gpus <gpus> --cpus <cpus> <name> <T> <N> <M> <n_iter>
<n_warmup> <n_trial>` is then executed. Any exception raised while running the
command is logged with `@error` and swallowed, so a failing benchmark does not
stop the caller.

`name` is not validated here; that is left to the worker (single.jl), which
owns the benchmark registry.
"""
function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial)
    header = string(
        name, ": T=", T, " gpus=", gpus, " cpus=", cpus, " N=", N, " M=", M, " ",
        "n_iter=", n_iter, " n_warmup=", n_warmup, " n_trial=", n_trial,
    )
    banner(header)

    launch = `bash $RUNNER $WORKER --gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
    try
        run(launch)
    catch err
        # Best-effort: report the failure and let the caller move on.
        @error "Benchmark '$(name)' failed; continuing." exception = err
    end
end

"""
    run_all_benchmarks(config="benchmarks.toml")

Parse `config` (a path relative to this file's directory) with `parse_config`
and call [`dispatch`](@ref) once for every benchmark spec it yields.

Each spec supplies `gpus`, `cpus`, `name`, `T`, and `args`; the first two
elements of `args` are forwarded as `N` and `M`. The iteration, warm-up, and
trial counts come from the global settings returned alongside the specs.
"""
function run_all_benchmarks(config="benchmarks.toml")
    settings, specs = parse_config(joinpath(@__DIR__, config))
    foreach(specs) do spec
        n, m = spec.args
        dispatch(;
            gpus=spec.gpus,
            cpus=spec.cpus,
            name=spec.name,
            T=spec.T,
            N=n,
            M=m,
            n_iter=settings.n_iter,
            n_warmup=settings.n_warmup,
            n_trial=settings.n_trial,
        )
    end
    return nothing
end

# Script entry point.
#   no args   -> run every entry declared in benchmarks.toml
#   with args -> run one benchmark described positionally as
#                <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
if isempty(ARGS)
    run_all_benchmarks()
else
    # All nine positionals are indexed below; fail early with a usage message
    # rather than an opaque BoundsError when some are missing. Extra trailing
    # arguments are ignored, as before.
    length(ARGS) >= 9 || throw(
        ArgumentError(
            "expected 9 arguments: <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> " *
            "<trial>; got $(length(ARGS))",
        ),
    )
    dispatch(;
        gpus=parse(Int, ARGS[1]),
        cpus=parse(Int, ARGS[2]),
        name=ARGS[3],
        T=ARGS[4],
        N=parse(Int, ARGS[5]),
        M=parse(Int, ARGS[6]),
        n_iter=parse(Int, ARGS[7]),
        n_warmup=parse(Int, ARGS[8]),
        n_trial=parse(Int, ARGS[9]),
    )
end
12 changes: 8 additions & 4 deletions benchmark/run_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,21 @@ if [[ $GPUS -lt 0 ]]; then
fi

if [[ $CPUS -lt 0 ]]; then
echo "CPUs ivnalid, using cpus = 1"
echo "CPUs invalid, using cpus = 1"
exit
fi

export LEGATE_AUTO_CONFIG=0
export LEGATE_CONFIG="--cpus=1 --gpus=$GPUS --omps=$CPUS --ompthreads=3 --utility=2 --sysmem=256 --numamem=19029 --fbmem=7569 --zcmem=128 --regmem=0"
export LEGATE_AUTO_CONFIG=1
export LEGATE_CONFIG="--cpus=$CPUS --gpus=$GPUS"
export LEGATE_SHOW_CONFIG=1

export LD_LIBRARY_PATH=""

echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"

CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}"
eval "julia --project -e 'using Pkg; Pkg.develop(path=\"..\"); Pkg.instantiate()'"

CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}"

printf "Running: %s\n" "$CMD"
eval "$CMD"
56 changes: 0 additions & 56 deletions benchmark/sgemm.jl

This file was deleted.

27 changes: 27 additions & 0 deletions benchmark/src/benchmarks/gemm.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Benchmark descriptor for a general matrix-matrix multiply with element type T.
# `initialize` below builds A as N x M, B as M x N and the output C as N x N.
# Base.@kwdef provides a keyword constructor, e.g. GEMM{Float32}(N=..., M=...).
Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
# Row count of A; also both dimensions of the square result C.
N::Int
# Column count of A / row count of B (the contracted dimension).
M::Int
end

"""
    name(::GEMM)

Return the identifier string of the GEMM benchmark, `"gemm"`.
"""
function name(::GEMM)
    return "gemm"
end

"""
    dims(g::GEMM)

Return the problem dimensions of `g` as the tuple `(N, M)`.
"""
function dims(g::GEMM)
    return (g.N, g.M)
end

"""
    data(g::GEMM{T})

Return a one-line human-readable description of `g`: its element type and sizes.
"""
function data(g::GEMM{T}) where {T}
    return "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
end

"""
    allowed_types(::Type{GEMM})

Return the `Union` of cuNumeric's supported floating-point and integer types,
i.e. the element types accepted for the GEMM benchmark.
"""
allowed_types(::Type{GEMM}) =
    Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}

"""
    total_flops(s::GEMM)

Return the operation count of one multiply: each of the `N * N` output entries
costs `2M - 1` operations.
"""
function total_flops(s::GEMM)
    per_entry = 2 * s.M - 1
    return s.N * s.N * per_entry
end

"""
    total_space(s::GEMM{T})

Return the number of bytes held by the three operands: two arrays of `N * M`
elements plus one of `N * N` elements, each element being `sizeof(T)` bytes.
"""
function total_space(s::GEMM{T}) where {T}
    elem_bytes = sizeof(T)
    input_bytes = 2 * ((s.N * s.M) * elem_bytes)
    output_bytes = (s.N * s.N) * elem_bytes
    return input_bytes + output_bytes
end

"""
    initialize(s::GEMM{T}; mod=cuNumeric)

Allocate the operands for one GEMM run using the array constructors of `mod`.

Creates random `N x M` and `M x N` inputs and a zeroed `N x N` output, triggers
a garbage collection, and returns them as `(C, A, B)` with the output first,
matching the argument order of `run!`.
"""
function initialize(s::GEMM{T}; mod=cuNumeric) where {T}
    lhs = mod.rand(T, s.N, s.M)
    rhs = mod.rand(T, s.M, s.N)
    out = mod.zeros(T, s.N, s.N)
    # Collect garbage now, before the caller starts using the operands.
    GC.gc()
    return (out, lhs, rhs)
end

"""
    run!(::GEMM, C, A, B)

Execute the benchmark kernel: compute `A * B` in place into `C` via `mul!` and
return the result of that call.
"""
function run!(::GEMM, C, A, B)
    return mul!(C, A, B)
end

# Register the GEMM type under the name "gemm".
# NOTE(review): presumably this is the name matched against `[[gemm]]` tables in
# benchmarks.toml and the <name> CLI argument — confirm against register_benchmark.
register_benchmark("gemm", GEMM)
Loading
Loading