From 8cf427ff87339c5fc9f58982f5bcc11f3d4dc7d1 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Mon, 21 Apr 2025 12:32:45 +0530 Subject: [PATCH 1/7] add depth param --- examples/llama-bench/llama-bench.cpp | 40 +++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index cbcbfcee861ee..f2c04b8614750 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -160,6 +160,7 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector> n_pg; + std::vector n_depth; std::vector n_batch; std::vector n_ubatch; std::vector type_k; @@ -192,6 +193,7 @@ static const cmd_params cmd_params_defaults = { /* n_prompt */ { 512 }, /* n_gen */ { 128 }, /* n_pg */ {}, + /* n_depth */ { 0 }, /* n_batch */ { 2048 }, /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, @@ -230,6 +232,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -d, --depth (default: %s)\n", + join(cmd_params_defaults.n_depth, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -ub, --ubatch-size (default: %s)\n", @@ -366,6 +370,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } else if (arg == "-d" || arg == "--depth") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -615,6 +626,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_pg.empty()) { 
params.n_pg = cmd_params_defaults.n_pg; } + if (params.n_depth.empty()) { + params.n_depth = cmd_params_defaults.n_depth; + } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } @@ -674,6 +688,7 @@ struct cmd_params_instance { std::string model; int n_prompt; int n_gen; + int n_depth; int n_batch; int n_ubatch; ggml_type type_k; @@ -745,7 +760,7 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; + cparams.n_ctx = n_prompt + n_gen + n_depth; cparams.n_batch = n_batch; cparams.n_ubatch = n_ubatch; cparams.type_k = type_k; @@ -780,6 +795,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nt : params.n_threads) for (const auto & cm : params.cpu_mask) for (const auto & cs : params.cpu_strict) + for (const auto & nd : params.n_depth) for (const auto & pl : params.poll) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -789,6 +805,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -818,6 +835,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ 0, /* .n_gen = */ n_gen, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -847,6 +865,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ n_pg.first, /* .n_gen = */ n_pg.second, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -900,6 +919,7 @@ struct test { bool embeddings; int n_prompt; int n_gen; + int n_depth; std::string test_time; std::vector samples_ns; @@ -931,6 +951,7 @@ struct test { embeddings = inst.embeddings; n_prompt = inst.n_prompt; n_gen = inst.n_gen; + n_depth = 
inst.n_depth; // RFC 3339 date-time format time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); @@ -1362,6 +1383,9 @@ struct markdown_printer : public printer { } else { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } + if (t.n_depth > 0) { + snprintf(buf, sizeof(buf), "%s @ d%d", buf, t.n_depth); + } value = buf; } else if (field == "t/s") { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); @@ -1603,6 +1627,12 @@ int main(int argc, char ** argv) { llama_attach_threadpool(ctx, threadpool, NULL); // warmup run + // if (t.n_depth > 0) { + // if (params.progress) { + // fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup depth run\n", params_idx, params_count); + // } + // test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + // } if (t.n_prompt > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); @@ -1620,6 +1650,14 @@ int main(int argc, char ** argv) { for (int i = 0; i < params.reps; i++) { llama_kv_self_clear(ctx); + if (t.n_depth > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + } + uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { From 27a8248431c44424fa29125019855a8fee1d5468 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Thu, 24 Apr 2025 17:54:11 +0530 Subject: [PATCH 2/7] update llama-bench README and add depth param --- examples/llama-bench/README.md | 155 +++++++++++++++++---------- examples/llama-bench/llama-bench.cpp | 22 ++-- 2 files changed, 104 insertions(+), 73 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75fbf8..ca8a1d5276393 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -28,6 +28,7 @@ options: -p, --n-prompt (default: 512) -n, --n-gen (default: 128) 
-pg (default: ) + -d, --n-depth (default: 0) -b, --batch-size (default: 2048) -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) @@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. +Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens. + For a description of the other options, see the [main example](../main/README.md). Note: @@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +### Different prefilled context + +``` +$ ./llama-bench.exe -d 0,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 | + ## Output formats By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -170,9 +186,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" ``` ### JSON @@ -184,64 +200,78 @@ $ ./llama-bench -o json 
```json [ { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 512, "n_gen": 0, - "test_time": "2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + "n_depth": 0, + "test_time": "2025-04-24T11:58:50Z", + "avg_ns": 72135640, + "stddev_ns": 1453752, + "avg_ts": 7100.002165, + "stddev_ts": 140.341520, + "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], + "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] }, { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - 
"model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 0, "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + "n_depth": 0, + "test_time": "2025-04-24T11:58:51Z", + "avg_ns": 1076767880, + "stddev_ns": 9449585, + "avg_ts": 118.881588, + "stddev_ts": 1.041811, + "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], + "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] } ] ``` @@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly 
Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 
],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} ``` @@ -271,25 +301,32 @@ $ ./llama-bench -o sql CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, cpu_info TEXT, gpu_info TEXT, + backends TEXT, model_filename TEXT, model_type TEXT, model_size INTEGER, model_n_params INTEGER, n_batch INTEGER, + n_ubatch INTEGER, n_threads INTEGER, - f16_kv INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, n_gpu_layers INTEGER, + split_mode TEXT, main_gpu INTEGER, - mul_mat_q INTEGER, + no_kv_offload INTEGER, + flash_attn INTEGER, tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, n_prompt INTEGER, n_gen INTEGER, + n_depth INTEGER, test_time TEXT, avg_ns INTEGER, stddev_ns INTEGER, @@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, 
n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, 
model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index f2c04b8614750..eb19238f72873 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -232,8 +232,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); - printf(" -d, --depth (default: %s)\n", - join(cmd_params_defaults.n_depth, ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -ub, --ubatch-size (default: %s)\n", @@ -370,7 +369,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); - } else if (arg == "-d" || arg == "--depth") { + } else if (arg == "-d" || arg == "--n-depth") { if (++i >= argc) { invalid_param = true; break; @@ -994,8 +993,8 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", 
"flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", + "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1005,8 +1004,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || + field == "avg_ns" || field == "stddev_ns") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1062,6 +1061,7 @@ struct test { std::to_string(embeddings), std::to_string(n_prompt), std::to_string(n_gen), + std::to_string(n_depth), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -1239,7 +1239,7 @@ struct markdown_printer : public printer { return 4; } if (field == "test") { - return 13; + return 15; } int width = std::max((int) field.length(), 10); @@ -1627,12 +1627,6 @@ int main(int argc, char ** argv) { llama_attach_threadpool(ctx, threadpool, NULL); // warmup run - // if (t.n_depth > 0) { - // if (params.progress) { - // fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup depth run\n", params_idx, params_count); - // } - // test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); - // } if (t.n_prompt > 0) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); From d60fdbf9decd387b71347a58c0b7944ead80548b Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Thu, 24 Apr 2025 20:00:43 +0530 Subject: [PATCH 3/7] llama-bench: default params for depth arg 
for faster execution --- examples/llama-bench/llama-bench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index eb19238f72873..3410863c89838 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1649,7 +1649,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, i + 1, params.reps); } - test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_depth, 512, cpu_get_num_math()); } uint64_t t_start = get_time_ns(); From 5ae962e2711cfb0bb3de3652e0753a1327b69989 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Fri, 25 Apr 2025 16:35:30 +0530 Subject: [PATCH 4/7] Update examples/llama-bench/README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- examples/llama-bench/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index ca8a1d5276393..1f5e2f66200a6 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -154,7 +154,7 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 ### Different prefilled context ``` -$ ./llama-bench.exe -d 0,512 +$ ./llama-bench -d 0,512 ``` | model | size | params | backend | ngl | test | t/s | From 179dade62be78545f51765ceb47db2446a56e79a Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Sat, 26 Apr 2025 02:49:15 +0530 Subject: [PATCH 5/7] fix buffer print ub --- examples/llama-bench/llama-bench.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3410863c89838..06875f8512bdb 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1384,7 +1384,8 @@ struct markdown_printer : public 
printer { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } if (t.n_depth > 0) { - snprintf(buf, sizeof(buf), "%s @ d%d", buf, t.n_depth); + int len = strlen(buf); + snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth); } value = buf; } else if (field == "t/s") { From 1f0619d55d24fcd5e6ec8d0268a56508d4b23c8e Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Sat, 26 Apr 2025 08:16:11 +0530 Subject: [PATCH 6/7] use user provided args --- examples/llama-bench/llama-bench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 06875f8512bdb..951a71d120c1c 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1650,7 +1650,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, i + 1, params.reps); } - test_prompt(ctx, t.n_depth, 512, cpu_get_num_math()); + test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); } uint64_t t_start = get_time_ns(); From 5f0f2019df1b761aa6d15c64826b921a6ff3ef14 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Mon, 28 Apr 2025 14:36:36 +0530 Subject: [PATCH 7/7] remove extra whitespaces --- examples/llama-bench/llama-bench.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 951a71d120c1c..cf955601b00de 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -993,7 +993,7 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", + "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", 
"stddev_ns", "avg_ts", "stddev_ts", }; return fields; @@ -1004,7 +1004,7 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || field == "stddev_ns") { return INT; }