diff --git a/.gitignore b/.gitignore
index 3b09cfb4..29f3d39d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,9 @@ help.sh
 docker.log
 docs/package-lock.json
 
+__pycache__
+*.pyc
+
 # auto-generated script
 build_wrapper.sh
 
@@ -14,6 +17,10 @@ logging/*
 debug
 debug/*
 
+# benchmark outputs
+benchmark/results
+benchmark/results/*
+
 compile_wrapper.sh
 
 *.tar.gz
diff --git a/Project.toml b/Project.toml
index c898d6b4..feca11f0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,12 +2,8 @@ name = "cuNumeric"
 uuid = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
 version = "0.1.1"
 
-[workspace]
-projects = ["test", "dev"]
-
 [deps]
 CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
-CUDA_SDK_jll = "6cbf2f2e-7e60-5632-ac76-dca2274e0be0"
 CxxWrap = "1f15a43c-97ca-5a2a-ae31-89f07a497df4"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
 Legate = "1238f2cf-6593-4d60-9aca-2f5364e49909"
@@ -33,7 +29,6 @@ CUDAExt = "CUDA"
 [compat]
 CNPreferences = "0.1.2"
 CUDA = "5.9"
-CUDA_SDK_jll = "13"
 CxxWrap = "0.17"
 JuliaFormatter = "2.3.0"
 Legate = "0.1.2"
@@ -47,3 +42,6 @@ StatsBase = "0.34"
 cunumeric_jl_wrapper_jll = "25.10.3"
 cupynumeric_jll = "25.10.3"
 julia = "1.10"
+
+[workspace]
+projects = ["test", "dev"]
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 00000000..62eb4c27
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,12 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDACore = "bd0ed864-bdfe-4181-a5ed-ce625a5fdea2"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+cuNumeric = "0fd9ffd4-7e84-4cd0-b8f8-645bd8c73620"
+
+[extras]
+CNPreferences = "3e078157-ea10-49d5-bf32-908f777cd46f"
+LegatePreferences = "8028f36a-2b64-49e9-aa04-2d0933fd2ed9"
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..753a0416
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,78 @@
+# Benchmark configuration
+
+Benchmarks are declared in `benchmarks.toml`. `run.jl` parses it.
+
+## Running
+
+```bash
+julia --project run.jl     # runs whatever benchmarks.toml configures
+```
+
+`run.jl` runs each (benchmark, backend) pair in its own process via
+`run_benchmark.sh`, so backends never share a GPU/runtime within a measurement.
+cuNumeric always runs; extra comparison backends are toggled in `[Global]`:
+
+- `cuda = true` → also run under CUDA.jl (single-GPU configs only; CUDA.jl is
+  single-device).
+- `cupynumeric = true` → also run under cupynumeric (see below).
+
+### Comparing against cupynumeric
+
+cupynumeric runs in a conda env whose major.minor matches this project's
+resolved `cupynumeric_jll`. Build it once:
+
+```bash
+./install_cupynumeric.sh   # creates env cupynumeric-bench-<major.minor>
+```
+
+`run.jl` derives the env name automatically; override it with `CUPYNUMERIC_ENV`.
+
+## Layout
+
+```toml
+[Global]
+n_warmup = 5
+n_iter   = 1000
+n_trial  = 5
+
+[[gemm]]            # name registered via register_benchmark in src/benchmarks/*.jl
+T    = "Float32"     # element type
+gpus = 1
+cpus = 2
+N    = 150
+M    = 150           # optional, defaults to 1
+```
+
+Repeat a `[[name]]` block to add independent configs.
+
+## Lists
+
+Any of `T`, `gpus`, `cpus`, `N`, `M` may be a list. They expand along
+two axes:
+- **`T` multiplies.** The whole sweep runs once per listed type.
+- **`gpus`, `cpus`, `N`, `M` zip** into a single lockstep sweep — element `i`
+  of each is paired together.
+
+Each zipped field must be one of:
+
+- a scalar or single-element list (`cpus = 2` or `[2]`) -> broadcast to every config
+- a list whose length equals the sweep length
+
+Any other length mismatch is an error.
+
+```toml
+[[gemm]]
+T    = ["Float64", "Float32"]   # multiplies
+gpus = [1, 2, 4]                #
+cpus = 2                        # zip -> (1,2,150,150), (2,2,300,300), (4,2,600,600)
+N    = [150, 300, 600]          #
+M    = [150, 300, 600]          #
+```
+
+-> 2 types * 3 sweep points = **6 runs**.
+
+### Gotcha
+
+When `T = ["Float32", "Float64"]` is combined with a length-2 `N`/`M` sweep, you
+get all **4** combinations, not a pairing `Float32 -> N[1], Float64 -> N[2]`. To
+pin a type to a specific size, use separate `[[name]]` blocks.
diff --git a/benchmark/benchmarks.toml b/benchmark/benchmarks.toml
new file mode 100644
index 00000000..688ae9e2
--- /dev/null
+++ b/benchmark/benchmarks.toml
@@ -0,0 +1,48 @@
+[Global]
+n_warmup = 5
+n_iter = 1000
+n_trial = 5
+cupynumeric = true # (needs install_cupynumeric.sh)
+cuda = false # compare against CUDA.jl (single-GPU configs only)
+
+####################################
+#             GEMM                 #
+# Work ~ 2*N^2*M. Hold N, scale M. #
+####################################
+
+[[gemm]]
+T = ["Float32", "Float64"]
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 4096
+M = [4096, 8192, 16384, 32768]
+
+#################################
+#         Gray-Scott            #
+#  Work ~ N*M. Hold N, scale M. #
+#################################
+
+[[grayscott_baseline]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+[[grayscott_lifetimes]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = 1024
+M = [1024, 2048, 4096, 8192]
+
+#################################
+#   Monte-Carlo Integration     #
+#  Work ~ N. Scale N linearly   #
+#################################
+
+[[montecarlo]]
+T = "Float32"
+gpus = [1, 2, 4, 8]
+cpus = 2
+N = [1_000_000, 2_000_000, 4_000_000, 8_000_000]
diff --git a/benchmark/install_cupynumeric.sh b/benchmark/install_cupynumeric.sh
new file mode 100755
index 00000000..541a654c
--- /dev/null
+++ b/benchmark/install_cupynumeric.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Install a cupynumeric conda env matching the cupynumeric_jll our project resolves.
+# The conda package and the JLL share the calendar-versioning scheme (e.g. 25.10),
+# so we pin major.minor (patch ignored) and install from the legate channel.
+#
+# Usage:
+#   ./install_cupynumeric.sh                 # create a fresh env named cupynumeric-bench-<ver>
+#   ./install_cupynumeric.sh --name myenv    # override the env name
+#   ./install_cupynumeric.sh --into existing # install into an existing env instead of creating one
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+ENV_NAME=""
+INTO_ENV=""
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --name)
+            ENV_NAME=$2
+            shift 2
+            ;;
+        --into)
+            INTO_ENV=$2
+            shift 2
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            echo "Usage: $0 [--name <env>] [--into <existing-env>]"
+            exit 1
+            ;;
+    esac
+done
+
+# Resolve the JLL version Julia actually instantiated for this project, then keep
+# major.minor only — conda packages are not published per patch.
+echo "Detecting cupynumeric_jll version from the benchmark project..."
+VER=$(cd "$SCRIPT_DIR" && julia --project -e '
+using Pkg
+for (_, info) in Pkg.dependencies()
+    info.name == "cupynumeric_jll" || continue
+    v = info.version
+    isnothing(v) && continue
+    println("$(v.major).$(v.minor)")
+end' | tail -1)
+
+if [[ -z "$VER" ]]; then
+    echo "Error: could not detect cupynumeric_jll version. Has the project been instantiated?"
+    exit 1
+fi
+
+echo "cupynumeric_jll major.minor: $VER"
+SPEC="cupynumeric=$VER.*"
+
+if [[ -n "$INTO_ENV" ]]; then
+    echo "Installing $SPEC into existing env '$INTO_ENV'..."
+    conda install -y -n "$INTO_ENV" -c conda-forge -c legate "$SPEC"
+    echo "Done. Activate with: conda activate $INTO_ENV"
+    exit 0
+fi
+
+[[ -z "$ENV_NAME" ]] && ENV_NAME="cupynumeric-bench-$VER"
+
+if conda env list | awk -v env="$ENV_NAME" '($1 "") == env { hit = 1 } END { exit !hit }'; then  # exact string match, no regex
+    echo "Env '$ENV_NAME' already exists; assuming it provides $SPEC, nothing to do."
+    echo "Activate with: conda activate $ENV_NAME"
+    exit 0
+fi
+
+echo "Creating env '$ENV_NAME' with $SPEC..."
+conda create -y -n "$ENV_NAME" -c conda-forge -c legate "$SPEC"
+
+echo "Done. Activate with: conda activate $ENV_NAME"
diff --git a/benchmark/run.jl b/benchmark/run.jl
new file mode 100644
index 00000000..7d6c3bb4
--- /dev/null
+++ b/benchmark/run.jl
@@ -0,0 +1,103 @@
+# run.jl: orchestrator. Builds one run_benchmark.sh command per benchmark and
+# dispatches it; the script sets LEGATE_CONFIG (from --gpus/--cpus) before
+# launching the worker (single.jl) that actually runs the benchmark.
+#   no args   -> one command per benchmarks.toml entry
+#   with args -> one command from <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>
+
+# Orchestrator stays off the GPU: it only needs GlobalSettings + parse_config,
+# both cuNumeric-free. The worker (single.jl) loads cuNumeric and the kernels.
+
+using Pkg
+
+include("src/core.jl")
+include("src/parse_benchmarks.jl")
+
+const RUNNER = joinpath(@__DIR__, "run_benchmark.sh")
+const WORKER = joinpath(@__DIR__, "src/single.jl")
+const PY_WORKER = joinpath(@__DIR__, "src_py/single.py")
+
+banner(msg) = println("\n", "="^128, "\n", msg, "\n", "="^128)
+
+# `_lifetimes` is a cuNumeric-only code-path variant (@analyze_lifetimes)
+cunumeric_only(name) = endswith(name, "_lifetimes")
+
+# Develop the package in the parent directory into the active project, then install all deps.
+function ensure_project_ready()
+    Pkg.develop(; path=joinpath(@__DIR__, ".."))
+    Pkg.instantiate()
+end
+
+# default env name mirrors install_cupynumeric.sh: cupynumeric-bench-<major>.<minor>
+# CUPYNUMERIC_ENV overrides it.
+function cupynumeric_env_name()
+    haskey(ENV, "CUPYNUMERIC_ENV") && return ENV["CUPYNUMERIC_ENV"]
+    for (_, info) in Pkg.dependencies()
+        info.name == "cupynumeric_jll" || continue
+        info.version === nothing && continue
+        return "cupynumeric-bench-$(info.version.major).$(info.version.minor)"
+    end
+    error("could not resolve cupynumeric_jll version; set CUPYNUMERIC_ENV explicitly")
+end
+
+# Run one benchmark configuration: cuNumeric always, CUDA.jl / cupynumeric when enabled.
+# Each backend is its own worker process; a failing backend is logged, not fatal.
+function dispatch(; gpus, cpus, name, T, N, M, n_iter, n_warmup, n_trial,
+    cupynumeric=false, cudajl=false)
+    banner(
+        "$(name): T=$(T) gpus=$(gpus) cpus=$(cpus) N=$(N) M=$(M) " *
+        "n_iter=$(n_iter) n_warmup=$(n_warmup) n_trial=$(n_trial)",
+    )
+
+    args = `--gpus $gpus --cpus $cpus $name $T $N $M $n_iter $n_warmup $n_trial`
+    cmds = [`bash $RUNNER $WORKER $args cunumeric`]
+    # CUDA.jl is single-GPU only; `_lifetimes` variants exist for cuNumeric alone
+    cudajl && gpus == 1 && !cunumeric_only(name) && push!(cmds, `bash $RUNNER $WORKER $args cudajl`)
+    if cupynumeric && !cunumeric_only(name)
+        push!(cmds, `bash $RUNNER $PY_WORKER --pyenv $(cupynumeric_env_name()) $args`)
+    end
+
+    for cmd in cmds
+        try
+            run(cmd)
+        catch e
+            # attach the exact command so the failing backend is identifiable in the log
+            @error "Benchmark '$(name)' failed; continuing." cmd exception = e
+        end
+    end
+end
+
+function run_all_benchmarks(config="benchmarks.toml")
+    gs, specs = parse_config(joinpath(@__DIR__, config))
+    for spec in specs
+        N, M = spec.args
+        dispatch(;
+            gpus=spec.gpus,
+            cpus=spec.cpus,
+            name=spec.name,
+            T=spec.T,
+            N=N, M=M,
+            n_iter=gs.n_iter,
+            n_warmup=gs.n_warmup,
+            n_trial=gs.n_trial,
+            cupynumeric=gs.cupynumeric,
+            cudajl=gs.cuda,
+        )
+    end
+end
+
+ensure_project_ready()
+if isempty(ARGS)
+    run_all_benchmarks()
+else # explicit single run; comparison backends stay off on this path
+    length(ARGS) >= 9 || error(
+        "expected 9 arguments <gpus> <cpus> <name> <T> <N> <M> <iter> <warmup> <trial>, " *
+        "got $(length(ARGS))",
+    )
+    dispatch(;
+        gpus=parse(Int, ARGS[1]), cpus=parse(Int, ARGS[2]),
+        name=ARGS[3], T=ARGS[4],
+        N=parse(Int, ARGS[5]), M=parse(Int, ARGS[6]),
+        n_iter=parse(Int, ARGS[7]), n_warmup=parse(Int, ARGS[8]),
+        n_trial=parse(Int, ARGS[9]),
+    )
+end
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
index 07a97a05..b802f7bc 100755
--- a/benchmark/run_benchmark.sh
+++ b/benchmark/run_benchmark.sh
@@ -11,6 +11,7 @@ shift
 
 GPUS=0
 CPUS=1
+PYENV=""
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -22,6 +23,10 @@ while [[ $# -gt 0 ]]; do
             CPUS=$2
             shift 2
             ;;
+        --pyenv)
+            PYENV=$2
+            shift 2
+            ;;
         *)
             # Collect all other arguments as extra arguments
             EXTRA_ARGS+=("$1")
@@ -43,17 +48,29 @@ if [[ $GPUS -lt 0 ]]; then
 fi
 
 if [[ $CPUS -lt 0 ]]; then
-    echo "CPUs ivnalid, using cpus = 1"
+    echo "CPUs invalid, using cpus = 1"
     exit
 fi
 
-export LEGATE_AUTO_CONFIG=0
-export LEGATE_CONFIG="--cpus=1 --gpus=$GPUS --omps=$CPUS --ompthreads=3 --utility=2 --sysmem=256 --numamem=19029 --fbmem=7569 --zcmem=128 --regmem=0"
+export LEGATE_AUTO_CONFIG=1
+export LEGATE_CONFIG="--cpus=$CPUS --gpus=$GPUS"
 export LEGATE_SHOW_CONFIG=1
 
+export LD_LIBRARY_PATH=""
+
 echo "Running $FILENAME with $CPUS CPUs and $GPUS GPUs"
 
-CMD="julia --project='..' $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+# Python (cupynumeric) workers run in the conda env built by install_cupynumeric.sh;
+# Julia (cuNumeric) workers run against the local project.
+if [[ $FILENAME == *.py ]]; then
+    if [[ -z $PYENV ]]; then
+        echo "Error: running a .py worker requires --pyenv <conda-env> (run install_cupynumeric.sh first)."
+        exit 1
+    fi
+    CMD="conda run --no-capture-output -n $PYENV python $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+else
+    CMD="julia --project $FILENAME $GPUS ${EXTRA_ARGS[@]}"
+fi
 
 printf "Running: %s\n" "$CMD"
 eval "$CMD"
diff --git a/benchmark/sgemm.jl b/benchmark/sgemm.jl
deleted file mode 100644
index 28d9ad7c..00000000
--- a/benchmark/sgemm.jl
+++ /dev/null
@@ -1,56 +0,0 @@
-using cuNumeric
-using LinearAlgebra
-using Printf
-
-function initialize_cunumeric(N, M)
-    A = cuNumeric.as_type(cuNumeric.rand(NDArray, N, M), Float32)
-    B = cuNumeric.as_type(cuNumeric.rand(NDArray, M, N), Float32)
-    C = cuNumeric.zeros(Float32, N, N)
-    GC.gc() # remove the intermediate FP64 arrays
-    return A, B, C
-end
-
-function total_flops(N, M)
-    return N * N * ((2*M) - 1)
-end
-
-function total_space(N, M)
-    return 2 * (N*M) * sizeof(Float32) + (N*N) * sizeof(Float32)
-end
-
-function gemm_cunumeric(N, M, n_samples, n_warmup)
-    A, B, C = initialize_cunumeric(N, M)
-
-    start_time = nothing
-    for idx in range(1, n_samples + n_warmup)
-        if idx == n_warmup + 1
-            start_time = get_time_microseconds()
-        end
-
-        mul!(C, A, B)
-    end
-    total_time_μs = get_time_microseconds() - start_time
-    mean_time_ms = total_time_μs / (n_samples * 1e3)
-    gflops = total_flops(N, M) / (mean_time_ms * 1e6) # GFLOP is 1e9
-
-    return mean_time_ms, gflops
-end
-
-gpus = parse(Int, ARGS[1])
-N = parse(Int, ARGS[2])
-M = parse(Int, ARGS[3])
-n_samples = parse(Int, ARGS[4])
-n_warmup = parse(Int, ARGS[5])
-
-println(
-    "[cuNumeric]  MATMUL benchmark on $(N)x$(M) matricies for $(n_samples) iterations, $(n_warmup) warmups"
-)
-
-mean_time_ms, gflops = gemm_cunumeric(N, M, n_samples, n_warmup)
-
-println("[cuNumeric]  Mean Run Time: $(mean_time_ms) ms")
-println("[cuNumeric]  FLOPS: $(gflops) GFLOPS")
-
-open("./gemm.csv", "a") do io
-    @printf(io, "%s,%d,%d,%d,%.6f,%.6f\n", "cunumeric", gpus, N, M, mean_time_ms, gflops)
-end
diff --git a/benchmark/src/benchmarks/gemm.jl b/benchmark/src/benchmarks/gemm.jl
new file mode 100644
index 00000000..a4356792
--- /dev/null
+++ b/benchmark/src/benchmarks/gemm.jl
@@ -0,0 +1,27 @@
+Base.@kwdef struct GEMM{T} <: AbstractBenchmark{T}
+    N::Int
+    M::Int
+end
+
+name(::GEMM) = "gemm"
+dims(g::GEMM) = (g.N, g.M)
+data(g::GEMM{T}) where {T} = "GEMM with T=$(T), N=$(g.N), M=$(g.M)"
+
+# Element types GEMM may be built with; `<:` accepts both `GEMM` and any `GEMM{T}`.
+allowed_types(::Type{<:GEMM}) =
+    Union{cuNumeric.SUPPORTED_FLOAT_TYPES,cuNumeric.SUPPORTED_INT_TYPES}
+
+total_flops(s::GEMM) = s.N * s.N * ((2*s.M) - 1)
+total_space(s::GEMM{T}) where {T} = 2 * ((s.N*s.M) * sizeof(T)) + ((s.N*s.N) * sizeof(T))
+
+function initialize(s::GEMM{T}; mod=cuNumeric) where {T}
+    A = mod.rand(T, s.N, s.M)
+    B = mod.rand(T, s.M, s.N)
+    C = mod.zeros(T, s.N, s.N)
+    GC.gc()
+    return C, A, B
+end
+
+run!(::GEMM, C, A, B) = mul!(C, A, B)
+
+register_benchmark("gemm", GEMM)
diff --git a/benchmark/src/benchmarks/grayscott.jl b/benchmark/src/benchmarks/grayscott.jl
new file mode 100644
index 00000000..a2d51315
--- /dev/null
+++ b/benchmark/src/benchmarks/grayscott.jl
@@ -0,0 +1,126 @@
+struct GSParams{T}
+    dx::T
+    dt::T
+    c_u::T
+    c_v::T
+    f::T
+    k::T
+end
+
+function GSParams{T}(; dx=1, c_u=1.0, c_v=0.3, f=0.03, k=0.06) where {T}
+    GSParams{T}(T(dx), T(dx / 5), T(c_u), T(c_v), T(f), T(k))
+end
+
+abstract type AbstractGrayScott{T} <: AbstractBenchmark{T} end
+
+Base.@kwdef struct GrayScottBaseline{T} <: AbstractGrayScott{T}
+    N::Int
+    M::Int
+end
+
+Base.@kwdef struct GrayScottLifetimes{T} <: AbstractGrayScott{T}
+    N::Int
+    M::Int
+end
+
+name(::AbstractGrayScott) = "grayscott"
+dims(b::AbstractGrayScott) = (b.N, b.M)
+data(b::AbstractGrayScott{T}) where {T} = "GrayScott with T=$(T), N=$(b.N), M=$(b.M)"
+allowed_types(::Type{<:AbstractGrayScott}) = cuNumeric.SUPPORTED_FLOAT_TYPES # `<:` so the registered concrete variants dispatch here
+total_flops(b::AbstractGrayScott) = b.N * b.M # grid points updated per step
+
+function build_benchmark(::Type{A}, ::Type{T}, N, M) where {A<:AbstractGrayScott,T}
+    A{T}(; N=N, M=M)
+end
+
+mutable struct GrayScottState{A,P}
+    u::A
+    v::A
+    u_new::A
+    v_new::A
+    params::P
+end
+
+function initialize(b::AbstractGrayScott{T}; mod=cuNumeric) where {T}
+    d = (b.N, b.M)
+    u = mod.ones(T, d)
+    v = mod.zeros(T, d)
+    u_new = mod.zeros(T, d)
+    v_new = mod.zeros(T, d)
+
+    seed = min(150, b.N, b.M)
+    u[1:seed, 1:seed] = mod.rand(T, (seed, seed))
+    v[1:seed, 1:seed] = mod.rand(T, (seed, seed))
+
+    return (GrayScottState(u, v, u_new, v_new, GSParams{T}()),)
+end
+
+# VARIANT DESCRIPTION
+# baseline: as written
+# lifetimes: step wrapped in @analyze_lifetimes
+let body = quote
+        # currently we don't have NDArray^x working yet.
+        F_u = (
+            (
+                -u[2:(end - 1), 2:(end - 1)] .*
+                (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+            ) + args.f * (1 .- u[2:(end - 1), 2:(end - 1)])
+        )
+        F_v = (
+            (
+                u[2:(end - 1), 2:(end - 1)] .*
+                (v[2:(end - 1), 2:(end - 1)] .* v[2:(end - 1), 2:(end - 1)])
+            ) - (args.f + args.k) * v[2:(end - 1), 2:(end - 1)]
+        )
+        # 2-D Laplacian via slicing, excluding boundaries
+        u_lap = (
+            (
+                u[3:end, 2:(end - 1)] - 2 * u[2:(end - 1), 2:(end - 1)] +
+                u[1:(end - 2), 2:(end - 1)]
+            ) ./ args.dx^2 +
+            (
+                u[2:(end - 1), 3:end] - 2 * u[2:(end - 1), 2:(end - 1)] +
+                u[2:(end - 1), 1:(end - 2)]
+            ) ./ args.dx^2
+        )
+        v_lap = (
+            (
+                v[3:end, 2:(end - 1)] - 2 * v[2:(end - 1), 2:(end - 1)] +
+                v[1:(end - 2), 2:(end - 1)]
+            ) ./ args.dx^2 +
+            (
+                v[2:(end - 1), 3:end] - 2 * v[2:(end - 1), 2:(end - 1)] +
+                v[2:(end - 1), 1:(end - 2)]
+            ) ./ args.dx^2
+        )
+
+        # Forward-Euler step for all interior points
+        u_new[2:(end - 1), 2:(end - 1)] =
+            ((args.c_u * u_lap) + F_u) * args.dt + u[2:(end - 1), 2:(end - 1)]
+        v_new[2:(end - 1), 2:(end - 1)] =
+            ((args.c_v * v_lap) + F_v) * args.dt + v[2:(end - 1), 2:(end - 1)]
+
+        # Periodic boundary conditions
+        u_new[:, 1] = u[:, end - 1]
+        u_new[:, end] = u[:, 2]
+        u_new[1, :] = u[end - 1, :]
+        u_new[end, :] = u[2, :]
+        v_new[:, 1] = v[:, end - 1]
+        v_new[:, end] = v[:, 2]
+        v_new[1, :] = v[end - 1, :]
+        v_new[end, :] = v[2, :]
+    end
+    @eval _gs_step!(b::GrayScottBaseline, u, v, u_new, v_new, args::GSParams) = $body
+    @eval _gs_step!(b::GrayScottLifetimes, u, v, u_new, v_new, args::GSParams) = @analyze_lifetimes $body
+end
+
+function run!(b::AbstractGrayScott, st::GrayScottState)
+    _gs_step!(b, st.u, st.v, st.u_new, st.v_new, st.params)
+    # swap references rather than copy
+    st.u, st.u_new = st.u_new, st.u
+    st.v, st.v_new = st.v_new, st.v
+    return nothing
+end
+
+register_benchmark("grayscott_baseline", GrayScottBaseline)
+register_benchmark("grayscott_lifetimes", GrayScottLifetimes)
diff --git a/benchmark/src/benchmarks/montecarlo.jl b/benchmark/src/benchmarks/montecarlo.jl
new file mode 100644
index 00000000..1df91c97
--- /dev/null
+++ b/benchmark/src/benchmarks/montecarlo.jl
@@ -0,0 +1,31 @@
+Base.@kwdef struct MonteCarloIntegration{T} <: AbstractBenchmark{T}
+    n_samples::Int
+end
+
+name(::MonteCarloIntegration) = "montecarlo"
+dims(mci::MonteCarloIntegration) = (mci.n_samples, 1)
+function data(mci::MonteCarloIntegration{T}) where {T}
+    "Monte Carlo Integration with T=$(T), n_samples=$(mci.n_samples)"
+end
+
+allowed_types(::Type{<:MonteCarloIntegration}) = cuNumeric.SUPPORTED_FLOAT_TYPES # also matches MonteCarloIntegration{T}
+
+total_space(s::MonteCarloIntegration{T}) where {T} = s.n_samples * sizeof(T)
+total_flops(s::MonteCarloIntegration) = s.n_samples
+
+function initialize(mci::MonteCarloIntegration{T}; mod=cuNumeric) where {T}
+    # Uniform samples over the integration domain [0, 10].
+    x = T(10) .* mod.rand(T, mci.n_samples)
+    GC.gc()
+    return (x,)
+end
+
+_domain_volume(mci::MonteCarloIntegration{T}) where {T} = T(10) / mci.n_samples
+run!(mci::MonteCarloIntegration, x) = _domain_volume(mci) * sum(exp.(-x .^ 2))
+
+# n_samples comes in as N; M is unused.
+function build_benchmark(::Type{MonteCarloIntegration}, ::Type{T}, N, M) where {T}
+    MonteCarloIntegration{T}(; n_samples=N)
+end
+
+register_benchmark("montecarlo", MonteCarloIntegration)
diff --git a/benchmark/src/core.jl b/benchmark/src/core.jl
new file mode 100644
index 00000000..db452c6e
--- /dev/null
+++ b/benchmark/src/core.jl
@@ -0,0 +1,130 @@
+using Printf
+using Statistics
+
+"""
+Settings shared by every benchmark run.
+- `n_warmup::Int` : untimed warmup iterations, intended to keep compilation cost out of the timing.
+- `n_iter::Int` : timed iterations per trial. Should be large enough to build up a queue
+    depth of tasks such that latency is hidden.
+- `n_trial::Int` : independent trials; each one re-initializes the benchmark and restarts
+    timing. Sets the number of datapoints used to estimate standard deviations/errors.
+- `n_gpu::Int` : number of GPUs used by legate. Set through LEGATE_CONFIG; bookkeeping only.
+- `cupynumeric::Bool` : also run comparable benchmarks under cupynumeric.
+- `cuda::Bool` : also run under CUDA.jl (single-GPU configs only).
+"""
+Base.@kwdef struct GlobalSettings
+    n_warmup::Int # Number of warmup steps, where timing is not done.
+    n_iter::Int # Number of iterations to run per trial
+    n_trial::Int = 1 # Number of independent trials to run (a single trial by default)
+    n_gpu::Int = 0 # bookkeeping only; the GPU count itself comes from LEGATE_CONFIG
+    cupynumeric::Bool = false # also run baselines under cupynumeric for comparison
+    cuda::Bool = false # also run under CUDA.jl for comparison (single-GPU only)
+end
+
+#########################################
+
+abstract type AbstractBenchmark{T} end
+
+# Interface each benchmark implements (see benchmarks/gemm.jl for a template).
+function name end
+function dims end
+function data end
+function allowed_types end
+function total_flops end
+function initialize end
+function run! end
+
+# Maps a benchmarks.toml table name to its benchmark type. Each benchmark file
+# registers itself via `register_benchmark`.
+const BENCHMARKS = Dict{String,Type}()
+function register_benchmark(key::AbstractString, ::Type{B}) where {B<:AbstractBenchmark}
+    BENCHMARKS[key] = B
+end
+
+function build_benchmark(::Type{B}, ::Type{T}, N, M) where {B<:AbstractBenchmark,T}
+    B{T}(; N=N, M=M)
+end
+
+#########################################
+
+# Per-trial timings for one benchmark. `times_ms[i]`/`gflops[i]` are the mean
+# over `n_iter` iterations for trial `i`; the spread across trials gives stddev.
+struct BenchmarkResult{B<:AbstractBenchmark}
+    times_ms::Vector{Float64}
+    gflops::Vector{Float64}
+    benchmark::B
+end
+
+# One timed trial: fresh state, `n_warmup` untimed iterations, then `n_iter` timed ones.
+function _trial(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric)
+    gs.n_iter > 0 || throw(ArgumentError("n_iter must be positive, got $(gs.n_iter)"))
+    GC.gc(true)
+    state = initialize(b; mod=mod)
+    for _ in 1:gs.n_warmup
+        run!(b, state...)
+    end
+    start_time = get_time_microseconds() # clock starts once all warmup calls are issued
+    for _ in 1:gs.n_iter
+        run!(b, state...)
+    end
+    total_time_μs = get_time_microseconds() - start_time
+
+    mean_time_ms = total_time_μs / (gs.n_iter * 1e3) # μs -> ms, averaged per iteration
+    gflops = total_flops(b) / (mean_time_ms * 1e6) # total_flops per second, in units of 1e9
+    return mean_time_ms, gflops
+end
+
+# Execute `gs.n_trial` independent trials via `_trial` and gather the per-trial mean
+# time [ms] and GFLOPS values into a `BenchmarkResult`.
+function run_benchmark(b::AbstractBenchmark, gs::GlobalSettings; mod=cuNumeric)
+    samples = [_trial(b, gs; mod=mod) for _ in 1:gs.n_trial]
+
+    # each sample is the (mean_time_ms, gflops) pair returned by `_trial`
+    times_ms = Float64[first(s) for s in samples]
+    gflops = Float64[last(s) for s in samples]
+
+    return BenchmarkResult(times_ms, gflops, b)
+end
+
+_std(x) = length(x) > 1 ? std(x) : 0.0
+
+# Append one CSV row per trial to results/<name>_<mod>.csv (directory created on demand).
+# Columns: mod, gpus, N, M, trial, time_ms, gflops.
+# NOTE(review): the element type is not written, and both Gray-Scott variants report the
+# same `name`, so those runs share one file with indistinguishable rows — confirm intended.
+function save_result(br::BenchmarkResult, gpus; mod::String="cunumeric")
+    N, M = dims(br.benchmark)
+    path = joinpath(@__DIR__, "..", "results", "$(name(br.benchmark))_$(mod).csv")
+    mkpath(dirname(path))
+    open(path, "a") do csv
+        for k in eachindex(br.times_ms)
+            @printf(csv, "%s,%d,%d,%d,%d,%.6f,%.6f\n", mod, gpus, N, M, k, br.times_ms[k], br.gflops[k])
+        end
+    end
+end
+
+#########################################
+
+# `setup` runs in the worker before the benchmark is built (e.g. flip a runtime
+# preference); code-path variants leave it a no-op.
+# struct Variant
+#     name::String
+#     setup::Function
+# end
+
+# const VARIANTS = Dict{String,Variant}()
+
+# function register_variant(name, setup=() -> nothing)
+#     VARIANTS[name] = Variant(name, setup)
+# end
+
+# function variant_setup(name)
+#     if haskey(VARIANTS, name)
+#         return VARIANTS[name].setup
+#     end
+#     return () -> nothing
+# end
+
+# register_variant("baseline")
+# register_variant("fusion_off", cuNumeric.CNPreferences.disable_broadcast_fusion!)
+# register_variant("fusion_on",  cuNumeric.CNPreferences.enable_broadcast_fusion!)
diff --git a/benchmark/src/parse_benchmarks.jl b/benchmark/src/parse_benchmarks.jl
new file mode 100644
index 00000000..605c5002
--- /dev/null
+++ b/benchmark/src/parse_benchmarks.jl
@@ -0,0 +1,86 @@
+using TOML
+
+"""
+One benchmark invocation parsed from `benchmarks.toml`. `name` selects the
+benchmark type from `BENCHMARKS`; `T` is the element type (e.g. "Float32");
+`args` are the sizes (currently `N M`).
+"""
+struct BenchmarkSpec
+    name::String
+    T::String
+    gpus::Int
+    cpus::Int
+    args::Vector{Int}
+end
+
+# A field may be a scalar or a list.
+aslist(x) = x isa AbstractVector ? collect(x) : [x]
+
+# Value of a zipped field for sweep position `i`. length==1 field broadcasts.
+sweep_value(field, i) = length(field) == 1 ? field[1] : field[i]
+
+# Number of positions in the sweep. Every multi-element field must agree on length;
+# length==1 fields broadcast and don't constrain it. Empty lists are rejected up front.
+function sweep_length(name, fields)
+    any(isempty(v) for (_, v) in fields) && error("benchmark '$(name)': gpus/cpus/N/M must not be empty lists")
+    lengths = unique(length(v) for (_, v) in fields if length(v) > 1)
+    length(lengths) <= 1 || error(
+        "benchmark '$(name)': zipped fields gpus/cpus/N/M must share one length " *
+        "or be scalar; got " * join(("$k=$(length(v))" for (k, v) in fields), ", "),
+    )
+    return isempty(lengths) ? 1 : first(lengths)
+end
+
+# Names of the `[[name]]` blocks in file order (TOML.jl parses into an unordered Dict,
+# so the source is scanned). A trailing `# comment` on a header line is tolerated.
+function declared_order(path)
+    order = String[]
+    for line in eachline(path)
+        m = match(r"^\s*\[\[\s*([^\[\]#]+?)\s*\]\]\s*(?:#.*)?$", line)
+        m === nothing && continue
+        name = String(m.captures[1])
+        name in order || push!(order, name) # keep only the first occurrence
+    end
+    return order
+end
+
+function parse_config(path)
+    raw = TOML.parsefile(path)
+
+    g = raw["Global"]
+    global_settings = GlobalSettings(;
+        n_warmup=g["n_warmup"], n_iter=g["n_iter"], n_trial=get(g, "n_trial", 1),
+        cupynumeric=get(g, "cupynumeric", false),
+        cuda=get(g, "cuda", false),
+    )
+
+    specs = BenchmarkSpec[]
+    for name in declared_order(path)
+        entries = raw[name]
+        for e in entries
+            types = aslist(get(e, "T", "Float32"))
+            gpus = aslist(e["gpus"])
+            cpus = aslist(e["cpus"])
+            # fusion = get(e, "fusion", true)
+            N = aslist(e["N"])
+            M = aslist(get(e, "M", 1))
+
+            n = sweep_length(name, ["gpus" => gpus, "cpus" => cpus, "N" => N, "M" => M])
+
+            for T in types, i in 1:n
+                push!(
+                    specs,
+                    BenchmarkSpec(
+                        name,
+                        T,
+                        sweep_value(gpus, i),
+                        sweep_value(cpus, i),
+                        [sweep_value(N, i), sweep_value(M, i)],
+                    ),
+                )
+            end
+        end
+    end
+
+    return global_settings, specs
+end
diff --git a/benchmark/src/single.jl b/benchmark/src/single.jl
new file mode 100644
index 00000000..5b2fff54
--- /dev/null
+++ b/benchmark/src/single.jl
@@ -0,0 +1,54 @@
+# single.jl: worker that runs exactly one benchmark under one backend. Launched by
+# run_benchmark.sh (dispatched from run.jl), which sets LEGATE_CONFIG before julia starts.
+# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial> <backend>
+# backend is "cunumeric" or "cudajl"; run.jl launches one worker per backend.
+
+using cuNumeric
+using CUDACore
+using LinearAlgebra
+
+include("core.jl")
+const BENCHMARK_DIR = joinpath(@__DIR__, "benchmarks")
+include.(filter(endswith(".jl"), readdir(BENCHMARK_DIR; join=true))) # literal suffix: r".jl$" also matched names like "foo_jl"
+
+# Resolve a TOML type string like "Float32" to the actual Julia type.
+parse_type(s) = getfield(Base, Symbol(s))::DataType
+
+# mod runs the kernels; label tags stdout; save_as names the results CSV.
+const BACKENDS = Dict(
+    "cunumeric" => (mod=cuNumeric, label="cuNumeric", save_as="cunumeric"),
+    "cudajl" => (mod=CUDACore, label="CUDA.jl", save_as="CUDA.jl"),
+)
+
+function run_single(gpus, name, T_str, N, M, n_iter, n_warmup, n_trial, backend)
+    haskey(BENCHMARKS, name) || error(
+        "No benchmark registered for '$(name)'. Known: $(join(sort(collect(keys(BENCHMARKS))), ", "))"
+    )
+    haskey(BACKENDS, backend) || error(
+        "Unknown backend '$(backend)'. Known: $(join(sort(collect(keys(BACKENDS))), ", "))"
+    )
+    bk = BACKENDS[backend]
+    T = parse_type(T_str)
+    b = build_benchmark(BENCHMARKS[name], T, N, M)
+    gs = GlobalSettings(; n_warmup=n_warmup, n_iter=n_iter, n_trial=n_trial)
+
+    println(
+        "[$(bk.label)] $(name) benchmark ($(T)) on $(N)x$(M) for $(n_iter) " *
+        "iterations ($(n_warmup) warmup) x $(n_trial) trials",
+    )
+    br = run_benchmark(b, gs; mod=bk.mod)
+    @printf("[%s] Mean Run Time: %.5f ± %.5f ms\n", bk.label, mean(br.times_ms), _std(br.times_ms))
+    @printf("[%s] FLOPS: %.5f ± %.5f GFLOPS\n", bk.label, mean(br.gflops), _std(br.gflops))
+    save_result(br, gpus; mod=bk.save_as)
+end
+
+gpus = parse(Int, ARGS[1])
+bench_name = ARGS[2]
+T_str = ARGS[3]
+N = parse(Int, ARGS[4])
+M = parse(Int, ARGS[5])
+n_iter = parse(Int, ARGS[6])
+n_warmup = parse(Int, ARGS[7])
+n_trial = parse(Int, ARGS[8])
+backend = ARGS[9]
+run_single(gpus, bench_name, T_str, N, M, n_iter, n_warmup, n_trial, backend)
diff --git a/benchmark/src_py/benchmarks/__init__.py b/benchmark/src_py/benchmarks/__init__.py
new file mode 100644
index 00000000..2eee4477
--- /dev/null
+++ b/benchmark/src_py/benchmarks/__init__.py
@@ -0,0 +1,13 @@
+"""Benchmark kernels; importing this package imports every submodule.
+
+Each submodule is expected to call ``core.register_benchmark`` at import time.
+"""
+
+import importlib
+import pkgutil
+
+from core import BENCHMARKS  # re-exported: single.py does `from benchmarks import BENCHMARKS`
+
+# Import each module so it self-registers into BENCHMARKS.
+for _module in pkgutil.iter_modules(__path__):
+    importlib.import_module(f".{_module.name}", package=__name__)
diff --git a/benchmark/src_py/benchmarks/gemm.py b/benchmark/src_py/benchmarks/gemm.py
new file mode 100644
index 00000000..b5d1a4b3
--- /dev/null
+++ b/benchmark/src_py/benchmarks/gemm.py
@@ -0,0 +1,41 @@
+"""Dense matrix-multiply (GEMM) benchmark for the cupynumeric runner."""
+
+import cupynumeric as np
+
+from core import register_benchmark
+
+
+class GEMM:
+    """Multiply an N x M matrix by an M x N matrix into a preallocated N x N output."""
+
+    name = "gemm"
+
+    def __init__(self, T, N, M):
+        self.T = T
+        self.N = N
+        self.M = M
+
+    def dims(self):
+        """Return the ``(N, M)`` pair the benchmark was built with."""
+        return self.N, self.M
+
+    def total_flops(self):
+        """Return N*N*(2M - 1): M multiplies plus M - 1 adds for each of the N*N outputs."""
+        n, m = self.N, self.M
+        return n * n * (2 * m - 1)
+
+    def initialize(self):
+        """Return ``(C, A, B)``: zeroed N x N output and random N x M, M x N inputs of dtype T."""
+        n, m, dtype = self.N, self.M, self.T
+        lhs = np.random.rand(n, m).astype(dtype)
+        rhs = np.random.rand(m, n).astype(dtype)
+        out = np.zeros((n, n), dtype=dtype)
+        return (out, lhs, rhs)
+
+    def run(self, state):
+        """Compute ``A @ B`` into ``C`` for the ``(C, A, B)`` tuple from ``initialize``."""
+        out, lhs, rhs = state
+        np.matmul(lhs, rhs, out=out)
+
+
+register_benchmark("gemm", GEMM)
diff --git a/benchmark/src_py/benchmarks/grayscott.py b/benchmark/src_py/benchmarks/grayscott.py
new file mode 100644
index 00000000..a1a89e73
--- /dev/null
+++ b/benchmark/src_py/benchmarks/grayscott.py
@@ -0,0 +1,99 @@
+"""Gray-Scott reaction-diffusion stencil benchmark for the cupynumeric runner."""
+
+import cupynumeric as np
+
+from core import register_benchmark
+
+
+def _laplacian(field, center, dx2):
+    """Five-point Laplacian on the interior; ``center`` must be ``field[1:-1, 1:-1]``."""
+    return (
+        (field[2:, 1:-1] - 2 * center + field[:-2, 1:-1]) / dx2
+        + (field[1:-1, 2:] - 2 * center + field[1:-1, :-2]) / dx2
+    )
+
+
+def _wrap_edges(dst, src):
+    """Set each border row/column of ``dst`` from the opposite interior edge of ``src``."""
+    dst[:, 0] = src[:, -2]
+    dst[:, -1] = src[:, 1]
+    dst[0, :] = src[-2, :]
+    dst[-1, :] = src[1, :]
+
+
+class GrayScott:
+    """Two-field (u, v) Gray-Scott update; each ``run`` call advances one step of size dt.
+
+    dt = dx/5; c_u, c_v, f, k as in grayscott.jl's GSParams defaults.
+    """
+
+    name = "grayscott"
+
+    def __init__(self, T, N, M, dx=1.0, c_u=1.0, c_v=0.3, f=0.03, k=0.06):
+        self.T, self.N, self.M = T, N, M
+        # Scalars are cast to T once here.
+        self.dx = T(dx)
+        self.dt = T(dx / 5)
+        self.c_u = T(c_u)
+        self.c_v = T(c_v)
+        self.f = T(f)
+        self.k = T(k)
+
+    def dims(self):
+        """Return the (N, M) grid shape."""
+        return self.N, self.M
+
+    def total_flops(self):
+        """Return N * M, the figure ``core.trial`` divides by time to report GFLOPS."""
+        # NOTE(review): one unit per grid cell, not the arithmetic actually done
+        # per cell -- presumably kept in step with grayscott.jl; confirm.
+        return self.N * self.M
+
+    def initialize(self):
+        """Build the state list ``[u, v, u_new, v_new]``.
+
+        u starts at one and v at zero, except that the leading square of side
+        ``min(150, N, M)`` of both is overwritten with ``np.random.rand`` values.
+        A list is returned so ``run`` can swap buffers in place.
+        """
+        shape = (self.N, self.M)
+        u = np.ones(shape, dtype=self.T)
+        v = np.zeros(shape, dtype=self.T)
+        u_new = np.zeros(shape, dtype=self.T)
+        v_new = np.zeros(shape, dtype=self.T)
+
+        side = min(150, self.N, self.M)
+        u[:side, :side] = np.random.rand(side, side).astype(self.T)
+        v[:side, :side] = np.random.rand(side, side).astype(self.T)
+        return [u, v, u_new, v_new]
+
+    def run(self, state):
+        """Write the next step into the ``*_new`` buffers, then swap them with the old ones."""
+        u, v, u_new, v_new = state
+        u_in = u[1:-1, 1:-1]
+        v_in = v[1:-1, 1:-1]
+
+        # Reaction terms.
+        react_u = (-u_in * (v_in * v_in)) + self.f * (1 - u_in)
+        react_v = (u_in * (v_in * v_in)) - (self.f + self.k) * v_in
+
+        # Diffusion terms.
+        dx2 = self.dx * self.dx
+        lap_u = _laplacian(u, u_in, dx2)
+        lap_v = _laplacian(v, v_in, dx2)
+
+        u_new[1:-1, 1:-1] = (self.c_u * lap_u + react_u) * self.dt + u_in
+        v_new[1:-1, 1:-1] = (self.c_v * lap_v + react_v) * self.dt + v_in
+
+        # Periodic boundary conditions, read from the pre-step fields.
+        _wrap_edges(u_new, u)
+        _wrap_edges(v_new, v)
+
+        # Swap references rather than copy.
+        state[0], state[2] = u_new, u
+        state[1], state[3] = v_new, v
+
+
+# NOTE(review): the registry key differs from ``GrayScott.name``, which single.py
+# passes to save_result for the CSV filename -- confirm this is intended.
+register_benchmark("grayscott_baseline", GrayScott)
diff --git a/benchmark/src_py/benchmarks/montecarlo.py b/benchmark/src_py/benchmarks/montecarlo.py
new file mode 100644
index 00000000..370fc7b9
--- /dev/null
+++ b/benchmark/src_py/benchmarks/montecarlo.py
@@ -0,0 +1,40 @@
+"""Monte Carlo integration benchmark for the cupynumeric runner."""
+
+import cupynumeric as np
+
+from core import register_benchmark
+
+
+class MonteCarlo:
+    """Estimate the integral of exp(-x^2) over [0, 10) from N uniform random samples."""
+
+    name = "montecarlo"
+
+    def __init__(self, T, N, M):
+        # M is not used; it is accepted because single.py constructs every
+        # benchmark as cls(T, N, M).
+        self.n_samples = N
+        self.T = T
+
+    def dims(self):
+        """Return ``(n_samples, 1)``."""
+        return self.n_samples, 1
+
+    def total_flops(self):
+        """Return the sample count, used as the work figure for the GFLOPS report."""
+        return self.n_samples
+
+    def initialize(self):
+        """Draw the samples: ``n_samples`` values of ``10 * rand`` cast to T, as a 1-tuple."""
+        scale = self.T(10)
+        samples = (scale * np.random.rand(self.n_samples)).astype(self.T)
+        return (samples,)
+
+    def run(self, state):
+        """Return ``(10 / n_samples) * sum(exp(-x^2))`` over the samples in ``state``."""
+        (samples,) = state
+        weight = self.T(10) / self.n_samples
+        return weight * np.sum(np.exp(-(samples * samples)))
+
+
+register_benchmark("montecarlo", MonteCarlo)
diff --git a/benchmark/src_py/core.py b/benchmark/src_py/core.py
new file mode 100644
index 00000000..f32a4fc3
--- /dev/null
+++ b/benchmark/src_py/core.py
@@ -0,0 +1,96 @@
+"""Shared registry, timing loop, statistics and CSV output for the cupynumeric benchmarks."""
+
+import os
+import math
+
+import cupynumeric as np
+from legate.timing import time  # blocks on preceding legate ops; returns microseconds
+
+MOD = "cupynumeric"
+RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "results")
+
+# Type names accepted by parse_type, mapped to their dtypes.
+DTYPES = {"Float32": np.float32, "Float64": np.float64}
+
+
+def parse_type(s):
+    """Return the dtype stored in ``DTYPES`` under the name ``s``.
+
+    Raises:
+        ValueError: if ``s`` is not a key of ``DTYPES``.
+    """
+    if s not in DTYPES:
+        raise ValueError(f"Unsupported type '{s}'. Known: {', '.join(DTYPES)}")
+    return DTYPES[s]
+
+
+# Benchmark key -> implementing class; filled by register_benchmark.
+BENCHMARKS = {}
+
+
+def register_benchmark(key, cls):
+    """Store ``cls`` in ``BENCHMARKS`` under ``key``, replacing any earlier entry."""
+    BENCHMARKS[key] = cls
+
+
+def trial(bench, n_warmup, n_iter):
+    """Time ``n_iter`` calls of ``bench.run`` after ``n_warmup`` untimed calls.
+
+    A fresh state is created with ``bench.initialize()`` on every call.
+
+    Returns:
+        ``(mean_time_ms, gflops)``: the mean time per timed iteration in
+        milliseconds, and ``bench.total_flops()`` divided by that time, in GFLOPS.
+
+    Raises:
+        ValueError: if ``n_iter < 1`` or ``n_warmup < 0``.
+    """
+    # With no timed iteration there is nothing to average; reject it up front
+    # instead of failing later on an unset start time.
+    if n_iter < 1:
+        raise ValueError(f"n_iter must be at least 1, got {n_iter}")
+    if n_warmup < 0:
+        raise ValueError(f"n_warmup must be non-negative, got {n_warmup}")
+
+    state = bench.initialize()
+    for _ in range(n_warmup):
+        bench.run(state)
+
+    start = time()
+    for _ in range(n_iter):
+        bench.run(state)
+    total_us = time() - start
+
+    mean_time_ms = total_us / (n_iter * 1e3)  # us -> ms, averaged over iterations
+    gflops = bench.total_flops() / (mean_time_ms * 1e6)  # flops per ms -> GFLOPS
+    return mean_time_ms, gflops
+
+
+def _mean(x):
+    """Return the arithmetic mean of ``x``; ``x`` must not be empty."""
+    return sum(x) / len(x)
+
+
+def _std(x):
+    """Return the sample standard deviation (n - 1 denominator); 0.0 below two values."""
+    if len(x) < 2:
+        return 0.0
+    m = _mean(x)
+    return math.sqrt(sum((v - m) ** 2 for v in x) / (len(x) - 1))
+
+
+def save_result(name, dims, gpus, times_ms, gflops):
+    """Append one CSV row per trial to ``<RESULTS_DIR>/<name>_<MOD>.csv``.
+
+    Each row is ``MOD,gpus,N,M,trial,time_ms,gflops`` with ``N, M = dims``, the
+    trial index starting at 1 and both measurements written with six decimals.
+    No header row is written; the results directory is created if missing.
+    """
+    os.makedirs(RESULTS_DIR, exist_ok=True)
+    N, M = dims
+    path = os.path.join(RESULTS_DIR, f"{name}_{MOD}.csv")
+    with open(path, "a") as fh:
+        fh.writelines(
+            f"{MOD},{gpus},{N},{M},{i},{t:.6f},{g:.6f}\n"
+            for i, (t, g) in enumerate(zip(times_ms, gflops), start=1)
+        )
diff --git a/benchmark/src_py/single.py b/benchmark/src_py/single.py
new file mode 100644
index 00000000..005cda31
--- /dev/null
+++ b/benchmark/src_py/single.py
@@ -0,0 +1,57 @@
+# cupynumeric worker, run by run_benchmark.sh which sets LEGATE_CONFIG first.
+# Args: <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial>
+import os
+import sys
+
+# Make `core` and the `benchmarks` package importable when run as a script.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from core import MOD, parse_type, trial, save_result, _mean, _std
+from benchmarks import BENCHMARKS  # import populates BENCHMARKS
+
+USAGE = "usage: single.py <gpus> <name> <T> <N> <M> <n_iter> <n_warmup> <n_trial>"
+
+
+def main():
+    """Run one benchmark configuration from ``sys.argv``, print a summary and save it.
+
+    Raises:
+        SystemExit: with the usage text if fewer than eight arguments are given.
+        ValueError: if an integer argument does not parse, ``n_trial < 1``, the
+            benchmark name is not registered, or ``parse_type`` rejects ``T``.
+    """
+    if len(sys.argv) < 9:
+        # sys.exit with a string prints it to stderr and exits with status 1.
+        sys.exit(USAGE)
+    gpus = int(sys.argv[1])
+    name = sys.argv[2]
+    T_str = sys.argv[3]
+    N, M, n_iter, n_warmup, n_trial = (int(arg) for arg in sys.argv[4:9])
+
+    # Without at least one trial there is nothing to average or save.
+    if n_trial < 1:
+        raise ValueError(f"n_trial must be at least 1, got {n_trial}")
+    if name not in BENCHMARKS:
+        raise ValueError(
+            f"No benchmark registered for '{name}'. Known: {', '.join(sorted(BENCHMARKS))}"
+        )
+    T = parse_type(T_str)
+    bench = BENCHMARKS[name](T, N, M)
+
+    print(
+        f"[{MOD}] {name} benchmark ({T_str}) on {N}x{M} for {n_iter} "
+        f"iterations ({n_warmup} warmup) x {n_trial} trials"
+    )
+
+    results = [trial(bench, n_warmup, n_iter) for _ in range(n_trial)]
+    times_ms = [t for t, _ in results]
+    gflops = [g for _, g in results]
+
+    print(f"[{MOD}] Mean Run Time: {_mean(times_ms):.5f} ± {_std(times_ms):.5f} ms")
+    print(f"[{MOD}] FLOPS: {_mean(gflops):.5f} ± {_std(gflops):.5f} GFLOPS")
+
+    save_result(bench.name, bench.dims(), gpus, times_ms, gflops)
+
+
+if __name__ == "__main__":
+    main()