Changes from all commits (179 commits)
80447f7
cuda : remove obsolete sources (ggml/1332)
ggerganov Aug 18, 2025
2ce5860
ggml-cpu: add mxfp4 VSX intrinsics for Power9+ (ppc64le) hardware (ll…
mgiessing Aug 19, 2025
02b49af
musa: handle __hgt2_mask, available starting from MUSA SDK rc4.3.0 (l…
yeahdongcn Aug 19, 2025
2572322
CANN: optimize rope operator (llama/15335)
YangShuai52 Aug 19, 2025
db1d238
opencl: mark `argsort` unsupported if cols exceed workgroup limit (ll…
lhez Aug 19, 2025
0eb2d65
musa: fix build warnings (llama/15258)
yeahdongcn Aug 20, 2025
5907ab3
vulkan: shorten pipeline name strings (llama/15431)
jeffbolznv Aug 20, 2025
316ed78
CUDA: replace GGML_CUDA_F16 with CUDA arch checks (llama/15433)
JohannesGaessler Aug 20, 2025
8f0579a
CUDA: refactor FA support/selection code (llama/15454)
JohannesGaessler Aug 20, 2025
622dec5
sched : copy only the used experts when offloading prompt processing …
slaren Aug 20, 2025
7c07784
musa: add GGML_UNUSED_VARS (llama/15446)
yeahdongcn Aug 21, 2025
c5874bc
ggml : fix condition of im2col on Metal backend (llama/15460)
ngxson Aug 21, 2025
04d0f9a
vulkan: Reuse conversion results in prealloc_y (llama/15410)
jeffbolznv Aug 21, 2025
7eebd49
vulkan: add exp operation (llama/15456)
ddwkim Aug 21, 2025
9dd5039
vulkan : support conv_2d_dw with f16 weights (llama/15392)
Acly Aug 21, 2025
554f96f
sched : fix possible use of wrong ids tensor when offloading moe prom…
slaren Aug 21, 2025
be841c3
CANN: Optimize RMS_NORM using cache (llama/15419)
noemotiovon Aug 22, 2025
380d3db
ggml-cpu: Support Q5_0 and Q5_1 on s390x (llama/15486)
taronaeo Aug 22, 2025
18ca4e8
cuda : add Pad Reflect 1D support (llama/14659)
YavorGIvanov Aug 22, 2025
d7b7498
ggml: add `conv3d` op (llama/15182)
rmatif Aug 22, 2025
bb5d7e2
ggml WebGPU: add support for quantization types (llama/15440)
reeselevine Aug 22, 2025
485c5c3
vulkan: optimize mul_mat_id loading row ids into shared memory (llama…
jeffbolznv Aug 23, 2025
5094171
vulkan : support ggml_mean (llama/15393)
Acly Aug 23, 2025
d8eb9f7
vulkan: Rewrite synchronization to allow some overlap between nodes (…
jeffbolznv Aug 23, 2025
2f6288c
vulkan: optimize rms_norm, and allow the work to spread across multip…
jeffbolznv Aug 23, 2025
b0d15e1
CUDA: fix half2 -> half conversion for HIP (llama/15529)
JohannesGaessler Aug 23, 2025
2781786
vulkan: workaround MoltenVK compile failure in multi_add (llama/15506)
jeffbolznv Aug 24, 2025
8c7872d
vulkan: enable Conv2D for Apple after MoltenVK fixed the bug (llama/1…
0cc4m Aug 24, 2025
85d4d2c
vulkan: Support FA with any multiple of 8 head sizes (llama/15537)
jeffbolznv Aug 24, 2025
ee11ed4
vulkan: apply MUL_MAT_ID subgroup optimization to non-coopmat devices…
0cc4m Aug 24, 2025
86331f7
CANN: ROPE cache sin/cos repeat (llama/15501)
noemotiovon Aug 25, 2025
54be54f
metal : add FA kernels for HS=40 (llama/15559)
ggerganov Aug 25, 2025
1e856b2
CUDA: MoE helper in device code, better tile sizes (llama/15525)
JohannesGaessler Aug 25, 2025
8851ef5
metal: fix regression when no metal devices are present (llama/15531)
booxter Aug 25, 2025
335d2a5
vulkan: fix min subgroup 16 condition for mmid subgroup optimization …
0cc4m Aug 25, 2025
582ef37
opencl: fix support ops condition for `rms_norm` (llama/15560)
lhez Aug 25, 2025
2468074
CUDA: Accelerate MXFP4 table lookup using `__byte_perm` (llama/15451)
Qeeweew Aug 25, 2025
79e2bd5
vulkan: Remove splitting for mul_mat_id (llama/15568)
jeffbolznv Aug 26, 2025
9828caa
Add a warning for special devices (llama/15563)
pt13762104 Aug 26, 2025
3bb52ac
metal : remove contiguous assertion for src0 in IM2COL (llama/15577)
CISC Aug 26, 2025
dc693ca
metal : improve `MUL_MAT_ID` (llama/15541)
ggerganov Aug 26, 2025
1c21a85
metal : optimize FA vec for large sequences and BS <= 8 (llama/15566)
ggerganov Aug 26, 2025
5301019
CUDA: return -1 for nonexistent compiled arch (llama/15587)
JohannesGaessler Aug 26, 2025
31c7784
llamafile: PowerPC Sgemm Optimization (llama/15558)
shalinib-ibm Aug 26, 2025
94fa9f6
SYCL: fix rms_norm_mul_add for tensor dim not a multiple of sg_size (…
qnixsynapse Aug 26, 2025
a6ec224
OpenCL: add fused group_norm/norm, mul, add (llama/15314)
rmatif Aug 27, 2025
ece1bdf
ggml-cpu : add basic RVV support for vector f32 ops (llama/15057)
xctan Aug 27, 2025
02e8b23
CANN: refactor mask handling and improve performance in FA (llama/15561)
noemotiovon Aug 27, 2025
65fa2c0
HIP: Enable support for ggml_backend_cuda_register_host_buffer (llama…
IMbackK Aug 27, 2025
88c0582
cuda: Add cublasLt_static linking when GGML_STATIC is enabled (llama/…
matiaslin Aug 28, 2025
cac6253
kv-cache : remove LLAMA_SET_ROWS checks (llama/15505)
ggerganov Aug 28, 2025
6dffbaa
ggml : fix SSM_SCAN for n_groups > 1 (llama/15625)
compilade Aug 28, 2025
6287027
ggml-cpu: fix invalid hsum build in debug s390x (llama/15634)
taronaeo Aug 28, 2025
dc9f55b
CUDA: add conv2d (llama/15635)
mnehete32 Aug 28, 2025
6d7ddaf
CUDA: fuse adds, fuse add with rms norm (llama/15631)
am17an Aug 29, 2025
82ce91e
CUDA: fix bug in rms_norm fusion (llama/15660)
am17an Aug 29, 2025
d629af1
CANN: FIx compiler warnings (llama/15661)
noemotiovon Aug 30, 2025
a6dec4f
vulkan: Skip syncing for prealloc_y when it is reused (llama/15544)
jeffbolznv Aug 30, 2025
b7809c4
CUDA: use FP32 arithmetic for conv2d (llama/15683)
JohannesGaessler Aug 30, 2025
f6ba394
llama: use FA + max. GPU layers by default (llama/15434)
JohannesGaessler Aug 30, 2025
7458384
ggml: update kleidiai to v1.13.0 (llama/15663)
chaxu01 Aug 30, 2025
71f0ee7
vulkan: clamp matmul and FA results to the max finite value (llama/15…
jeffbolznv Aug 31, 2025
20ce6fc
vulkan: Allow fallback to sysmem memory when vidmem is full (llama/15…
jeffbolznv Aug 31, 2025
b092e95
vulkan : remove unused portability_enumeration_ext variable (llama/15…
danbev Aug 31, 2025
191def7
vulkan: mul_mat_id coopmat2 optimizations (llama/15546)
jeffbolznv Aug 31, 2025
db7ecfb
vulkan: handle large sizes for get_rows (llama/15686)
jeffbolznv Aug 31, 2025
b11c972
llama : separate compute buffer reserve from fattn check (llama/15696)
slaren Aug 31, 2025
3d47068
metal : fix checks for available FA kernels (llama/15700)
ggerganov Aug 31, 2025
ed7ebdc
CANN: fix RoPE cache issue on multi-device (llama/15629)
hipudding Sep 1, 2025
bb5f844
CANN: Optimize MUL_MAT_ID (llama/15658)
hipudding Sep 1, 2025
2ba5e0c
CUDA: fix build error from ambiguous __half conversions in conv2d (ll…
qnixsynapse Sep 1, 2025
c5f511e
ggml : WebGPU add TRANSPOSE and RESHAPE to supported ops (llama/15695)
danbev Sep 1, 2025
5e70d90
Vulkan: Add Integer Dot Product mul_mat_vec shader for legacy quants …
0cc4m Sep 1, 2025
31840a3
ggml: aarch64: Implement SVE F16 kernels for vector functions (llama/…
Vithulep Sep 1, 2025
8218dc6
ggml: SVE support for exponential functions (llama/15145)
s-goto-11 Sep 1, 2025
d5f80a2
vulkan: disable large mmv subgroups on older Nvidia GPUs (llama/15717)
0cc4m Sep 1, 2025
7a5e736
vulkan: add missing clamps in new mul_mat_id paths (llama/15702)
jeffbolznv Sep 1, 2025
9e3600e
vulkan: use memory budget extension to read memory usage (llama/15545)
giladgd Sep 1, 2025
f20a7b0
ggml-backend: raise GGML_MAX_SPLIT_INPUTS (llama/15722)
JohannesGaessler Sep 1, 2025
13d3963
CANN: Support ext_factor in rope (llama/15710)
hipudding Sep 2, 2025
3db49c1
CANN: Support eager execution mode under ACL graph compilation (llama…
noemotiovon Sep 2, 2025
fb37f91
opencl: add attn sinks support for FA kernels (llama/15706)
rmatif Sep 2, 2025
1e03aa6
vulkan: Fix macro parameter order for f32 matmul shaders (llama/15716)
jeffbolznv Sep 2, 2025
5aee53c
CANN: Resolve soft_max precision issue (llama/15730)
hipudding Sep 2, 2025
e584edb
vulkan: fix shaders gen when no integer dot is available (llama/15740)
0cc4m Sep 2, 2025
d84b96d
CANN: Fix type float_t to float (llama/15736)
noemotiovon Sep 3, 2025
91e9e72
CANN: Mask unsupported TRANSPOSE_1D operator (llama/15733)
hipudding Sep 3, 2025
75f739c
ggml-cpu : optimize RVV kernels (llama/15720)
xctan Sep 3, 2025
51bc843
CANN: Add RoPE contiguous check for 310I DUP device (llama/15735)
hipudding Sep 3, 2025
9eef377
CUDA: Optimize `rms_norm_f32` kernel and its fused variants, giving 1…
ORippler Sep 3, 2025
85c7aa3
ggml vulkan: add hardsigmoid and hardswish operations (llama/15762)
relent95 Sep 3, 2025
4144ae1
vulkan : update ggml_vk_instance_validation_ext_available (llama/15666)
danbev Sep 3, 2025
4a702a8
vulkan: don't use std::string in load_shaders, to improve compile tim…
jeffbolznv Sep 3, 2025
719a05c
vulkan: fix mmv subgroup16 selection (llama/15775)
0cc4m Sep 3, 2025
5c860e9
CANN: fix acl_rstd allocation size in ggml_cann_rms_norm (llama/15760)
noemotiovon Sep 4, 2025
1569daf
opencl: add hs=40 to FA (llama/15758)
rmatif Sep 4, 2025
96efb47
CANN: Fix precision issue on 310I DUO multi-devices (llama/15784)
hipudding Sep 4, 2025
2228462
ggml: add ops for WAN video model (cuda && cpu) (llama/15669)
leejet Sep 4, 2025
3780a3c
CANN: Refactor ND to NZ workspace to be per-device (llama/15763)
noemotiovon Sep 4, 2025
ffe560c
metal : Add template specialization for mul_mm_id w/ ne20 == 10 (llam…
gabe-l-hart Sep 4, 2025
c80f78c
CUDA : conditionally add cuda architectures (ggml/1341)
gjasny Sep 10, 2025
4d6e114
ggml : introduce semantic versioning (ggml/1336)
danbev Sep 16, 2025
6ff468c
CUDA: fastdiv, launch bounds for mmvq + q8_1 quant (llama/15802)
JohannesGaessler Sep 5, 2025
f499271
ggml-cpu: drop support for nnpa intrinsics (llama/15821)
taronaeo Sep 6, 2025
69400f1
ggml-cpu: document use of "free" memory [no ci] (llama/15834)
JohannesGaessler Sep 6, 2025
be2676b
kleidiai: generalize compute_forward_kv_cache to compute_forward_fp16…
chaxu01 Sep 6, 2025
cd70d89
CUDA: faster tile FA (Pascal/AMD), headsize 256 (llama/15769)
JohannesGaessler Sep 6, 2025
cda7d4e
ggml WebGPU: remove userdata from request adapter callback (llama/15527)
danbev Sep 7, 2025
647e2d7
vulkan: Use larger loads in scalar/coopmat1 matmul (llama/15729)
jeffbolznv Sep 7, 2025
9523fd8
vulkan: Support pad_ext (llama/15794)
jeffbolznv Sep 7, 2025
db4f504
ggml-cpu: clean up s390x SIMD (llama/15855)
taronaeo Sep 7, 2025
dfa7722
vulkan: support im2col_3d (llama/15795)
jeffbolznv Sep 7, 2025
d9c0ead
CANN: Stream sync between devices for acl_graph (llama/15809)
noemotiovon Sep 8, 2025
0175a1d
CUDA: non-contiguous src0 not supported for PAD (llama/15869)
CISC Sep 8, 2025
40bcd1a
ggml: allow casting between f32 and i32 (llama/15783)
ngxson Sep 8, 2025
e9cb59e
metal : refactor + optimize (llama/15857)
ggerganov Sep 20, 2025
ae6cc6a
cuda : fix supports_op condition for get_rows when number of blocks i…
ggerganov Sep 8, 2025
70ee808
CUDA: generate_cu_files.py - add missing mxfp4 (llama/15880)
am17an Sep 8, 2025
c29cd54
vulkan: sort graph to allow more parallel execution (llama/15850)
jeffbolznv Sep 8, 2025
2609822
CUDA: fix GET_ROWS for large tensors (llama/15882)
JohannesGaessler Sep 9, 2025
621764b
CUDA: Add mul_mat_id support for the mmf kernel (llama/15767)
am17an Sep 9, 2025
7fbbb67
Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs …
lksj92hs Sep 9, 2025
e35d137
HIP: use v_dot2_f32_f16 instruction for FA (llama/15884)
JohannesGaessler Sep 9, 2025
d0e9865
vulkan: Fix OOB accesses in soft_max_back (llama/15861)
jeffbolznv Sep 9, 2025
7abe187
vulkan: throw the oom error instead of no memory type found (llama/15…
0cc4m Sep 9, 2025
9b773ac
CANN: implement LRU cache for ACL graphs (llama/15814)
noemotiovon Sep 10, 2025
4d453b1
CANN: Add ROPE sin/cos cache for reuse (llama/15912)
noemotiovon Sep 10, 2025
e2c7f1c
sync : ggml
ggerganov Sep 20, 2025
7eae055
metal : make the backend async (llama/15906)
ggerganov Sep 20, 2025
c974f63
sync : ggml
ggerganov Sep 20, 2025
3617008
ggml-cpu : fix padding in ggml_timestep_embedding (llama/15917)
danbev Sep 10, 2025
f5ef0e2
CUDA: Add `fastdiv` to `k_bin_bcast*`, giving 1-3% E2E performance (l…
ORippler Sep 10, 2025
dadf736
CANN: Disable acl_graph for prefill stage (llama/15933)
hipudding Sep 11, 2025
b079d9c
kleidiai: fix GGML_ASSERT(*cur_backend_id != -1) failed (llama/15614)
chaxu01 Sep 11, 2025
020eb19
ggml-cpu : add check for ARM MATMUL_INT8/i8mm support (llama/15922)
danbev Sep 11, 2025
f0768eb
CUDA: larger SRAM reads for tile FA, AMD FP16 dot (llama/15927)
JohannesGaessler Sep 11, 2025
555dcb3
ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type (llama/1…
slaren Sep 11, 2025
cd764ea
Revert "sycl: add usage of enqueue_functions extension (llama/14244)"…
NeoZhangJianyu Sep 12, 2025
5a752ba
vulkan: Make device memory check more portable (llama/15939)
mbaudier Sep 12, 2025
424c85f
Vulkan iGPU device selection overhaul and PCI ID API support (llama/1…
0cc4m Sep 12, 2025
e902731
ggml-zdnn: fix #15414, activate FP16 and BF16 acceleration and incorr…
taronaeo Sep 12, 2025
20a930e
metal : fix memory leaks (llama/15962)
ggerganov Sep 13, 2025
0d36ba9
metal : allow ops to run concurrently (llama/15929)
ggerganov Sep 13, 2025
2caf15d
metal : refactor kernel loading (llama/15964)
ggerganov Sep 13, 2025
a3defb0
vulkan: initialize vulkan-hpp to allow using extension function point…
jeffbolznv Sep 13, 2025
1789ed3
vulkan: fix failing dequant shaders (llama/15862)
jeffbolznv Sep 13, 2025
7dca05c
ggml-zdnn: rm user mapped buffers (llama/15965)
taronaeo Sep 14, 2025
2d3f156
metal : fix kernel requirements (llama/15983)
ggerganov Sep 14, 2025
c36358c
Vulkan: Clean up mul_mm shader (llama/15987)
0cc4m Sep 14, 2025
82a8c14
metal : remove memory pools (llama/15966)
ggerganov Sep 14, 2025
10bd5d3
CUDA: some micro-optimizations in mmf.cuh for mul_mat_id (llama/15926)
am17an Sep 15, 2025
a642b53
SYCL: Add COUNT_EQUAL operator support (llama/15991)
yael-works Sep 15, 2025
f72ec18
CUDA: fix im2col_3d to respect non-contiguous inputs (views) (llama/1…
jakekarnes42 Sep 15, 2025
5c524bb
ggml : fix padding in timestep embedding kernels (llama/15932)
danbev Sep 16, 2025
e32c3b0
CANN: Optimize ggml_cann_set_device (llama/15935)
noemotiovon Sep 17, 2025
e96b285
vulkan: automatically remove unsupported devices (llama/15976)
netrunnereve Sep 17, 2025
d452f0c
CUDA: fix FA occupancy, optimize tile kernel (llama/15982)
JohannesGaessler Sep 17, 2025
6458bac
sync : ggml
ggerganov Sep 20, 2025
eb2c01f
metal : refactor + optimize v2 (llama/15995)
ggerganov Sep 20, 2025
1361f67
GGML WebGPU: Support for ADD, MUL, RMS_NORM, GET_ROWS operators (llam…
reeselevine Sep 17, 2025
c46adc0
CANN: Remove print (llama/16044)
noemotiovon Sep 18, 2025
1f24b1d
metal : handle nil cv during pipeline creation (llama/16065)
ggerganov Sep 18, 2025
32b6d9c
metal : avoid call free for non-owned buffer (llama/16067)
jhen0409 Sep 18, 2025
d37f590
metal : improve F32, F16 and BF16 mat-vec multiplication (llama/16057)
ggerganov Sep 18, 2025
225d7c1
cuda : add missing F32<->I32 entries in ggml_cuda_cpy_fn (llama/16060)
CISC Sep 18, 2025
960aaa9
metal : use function constants for mul_mv_ext kernels (llama/16074)
ggerganov Sep 18, 2025
05bdfd4
CUDA: fix compilation on CC 6.0 (llama/16091)
JohannesGaessler Sep 18, 2025
fce6354
CUDA: Optimize PAD_REFLECT_1D (llama/15957)
bugparty Sep 18, 2025
7fcb7e8
rename optimize_graph to graph_optimize (llama/16082)
jeffbolznv Sep 18, 2025
f4a225c
opencl: optimize mxfp4 kernels (llama/16037)
shawngu-quic Sep 18, 2025
4575f96
cmake : fix static linking for OpenMP on Unix-like systems (llama/16031)
angt Sep 18, 2025
4d8cd07
ggml-amx : fix ggml_amx_init() on generic Linux (llama/16049)
angt Sep 18, 2025
2ad00d5
ggml : refactor forward_dup for cpu backend (llama/16062)
ngxson Sep 19, 2025
76d0934
vulkan: use vec dot for matrix matrix multiplications (llama/16056)
0cc4m Sep 20, 2025
66ad624
sync : ggml
ggerganov Sep 20, 2025
36778bd
talk-llama : sync llama.cpp
ggerganov Sep 20, 2025
01419ae
metal : restore im2col performance (#0)
ggerganov Sep 20, 2025
e042cc0
metal : fuse NORM + MUL + ADD (#0)
ggerganov Sep 20, 2025
aa10f77
metal : enable reordering of CPY and SET_ROWS (#0)
ggerganov Sep 20, 2025
4 changes: 2 additions & 2 deletions examples/talk-llama/CMakeLists.txt
@@ -16,8 +16,8 @@ if (WHISPER_SDL2)
         llama-hparams.cpp
         llama-impl.cpp
         llama-io.cpp
-        llama-kv-cache-unified.cpp
-        llama-kv-cache-unified-iswa.cpp
+        llama-kv-cache.cpp
+        llama-kv-cache-iswa.cpp
         llama-memory-recurrent.cpp
         llama-memory-hybrid.cpp
         llama-memory.cpp
105 changes: 101 additions & 4 deletions examples/talk-llama/llama-adapter.cpp
@@ -6,6 +6,7 @@
 
 #include <map>
 #include <cassert>
+#include <sstream>
 #include <stdexcept>
 
 // vec
@@ -163,13 +164,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
 
     // check metadata
     {
+        const gguf_context * gguf_ctx = ctx_gguf.get();
+
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
+
+        // get metadata as string
+        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
+            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
+                : gguf_type_name(type);
+            const char * name = gguf_get_key(gguf_ctx, i);
+            const std::string value = gguf_kv_to_str(gguf_ctx, i);
+
+            if (type != GGUF_TYPE_ARRAY) {
+                adapter.gguf_kv.emplace(name, value);
+            }
+
+            const size_t MAX_VALUE_LEN = 40;
+            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
+            replace_all(print_value, "\n", "\\n");
+
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
+        }
+
         auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+            int id = gguf_find_key(gguf_ctx, key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
         };
         auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+            int id = gguf_find_key(gguf_ctx, key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
         };
         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
@@ -190,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
 
         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+
+        // parse alora invocation sequence vector
+        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
+        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (kid >= 0) {
+            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
+                throw std::runtime_error("invalid gguf type for " + key);
+            }
+            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
+            if (arr_type != GGUF_TYPE_UINT32) {
+                throw std::runtime_error("invalid gguf element type for " + key);
+            }
+            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
+            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
+            adapter.alora_invocation_tokens.resize(seq_len);
+            std::copy(
+                (const llama_token *)data,
+                (const llama_token *)data + seq_len,
+                adapter.alora_invocation_tokens.begin());
+        }
     }
 
     int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
@@ -383,6 +429,57 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
     return nullptr;
 }
 
+int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
+    const auto & it = adapter->gguf_kv.find(key);
+    if (it == adapter->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
+    return (int)adapter->gguf_kv.size();
+}
+
+int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = adapter->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = adapter->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
+
+uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
+    if (!adapter) {
+        return 0;
+    }
+    return adapter->alora_invocation_tokens.size();
+}
+
+const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
+    GGML_ASSERT(adapter);
+    return adapter->alora_invocation_tokens.data();
+}
6 changes: 6 additions & 0 deletions examples/talk-llama/llama-adapter.h
@@ -67,6 +67,12 @@ struct llama_adapter_lora {
 
     float alpha;
 
+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
+    // activated lora (aLoRA)
+    std::vector<llama_token> alora_invocation_tokens;
+
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
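
The two new struct members mirror the parsing code in llama-adapter.cpp: gguf_kv holds the non-array metadata keys, and alora_invocation_tokens holds the activated-LoRA (aLoRA) invocation sequence. A short, hedged sketch of reading that sequence back, again assuming the getters from the llama-adapter.cpp diff are visible to the caller and that `print_alora_invocation` is a hypothetical helper:

// Minimal sketch, not part of this diff: print the aLoRA invocation token ids.
#include <cstdio>
#include "llama.h"

static void print_alora_invocation(const llama_adapter_lora * adapter) {
    const uint64_t n = llama_adapter_get_alora_n_invocation_tokens(adapter);
    const llama_token * toks = llama_adapter_get_alora_invocation_tokens(adapter);
    for (uint64_t i = 0; i < n; i++) {
        printf("%d ", toks[i]); // llama_token is a 32-bit integer id
    }
    printf("\n");
}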