Commit 4f26028

feat: Update llama.cpp

1 parent e1af05f

3 files changed (+39, -2 lines)

llama_cpp/_internals.py

Lines changed: 6 additions & 0 deletions
@@ -287,18 +287,24 @@ def pooling_type(self) -> int:
         return llama_cpp.llama_pooling_type(self.ctx)

     def kv_cache_clear(self):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_clear(self.memory, True)

     def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
+        seq_id = seq_id if seq_id >= 0 else 0
         llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)

     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)

     def kv_cache_seq_keep(self, seq_id: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_keep(self.memory, seq_id)

     def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
+        assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)

     def get_state_size(self) -> int:
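
The new asserts fail fast with a clear message instead of passing a NULL memory handle to the C API. A minimal caller-side sketch, assuming ctx is an already constructed llama_cpp._internals.LlamaContext:

    # Hypothetical caller-side handling of the new guard; the message matches
    # the assert added in this commit.
    try:
        ctx.kv_cache_clear()
    except AssertionError as exc:
        print(f"KV cache unavailable: {exc}")  # "Memory is not initialized"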

llama_cpp/llama_cpp.py

Lines changed: 32 additions & 1 deletion
@@ -381,6 +381,7 @@
 # //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
 # LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
 #
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -419,6 +420,7 @@
 # LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
 LLAMA_FTYPE_GUESSED = 1024

 # enum llama_rope_scaling_type {
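
The new constant slots in alongside the existing LLAMA_FTYPE_* values. A minimal sketch of selecting it for a quantization run; llama_model_quantize_default_params() is assumed to be exposed by these bindings as it is in llama.h:

    import llama_cpp

    # Sketch only: request the new MXFP4 MoE file type.
    qparams = llama_cpp.llama_model_quantize_default_params()
    qparams.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_MXFP4_MOE  # == 38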
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
 # bool use_mmap; // use mmap if possible
 # bool use_mlock; // force system to keep model in RAM
 # bool check_tensors; // validate model tensor data
+# bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
         vocab_only (bool): only load the vocabulary, no weights
         use_mmap (bool): use mmap if possible
         use_mlock (bool): force system to keep model in RAM
-        check_tensors (bool): validate model tensor data"""
+        check_tensors (bool): validate model tensor data
+        use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""

     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
         use_mmap: bool
         use_mlock: bool
         check_tensors: bool
+        use_extra_bufts: bool

     _fields_ = [
         ("devices", ctypes.c_void_p), # NOTE: unnused
@@ -739,6 +744,7 @@
         ("use_mmap", ctypes.c_bool),
         ("use_mlock", ctypes.c_bool),
         ("check_tensors", ctypes.c_bool),
+        ("use_extra_bufts", ctypes.c_bool),
     ]

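A minimal sketch of toggling the new model-parameter field; llama_model_default_params() already exists in these bindings, and the value chosen here is illustrative only:

    import llama_cpp

    # Sketch: opt out of the extra buffer types used for weight repacking.
    mparams = llama_cpp.llama_model_default_params()
    mparams.use_extra_bufts = False
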
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
 # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 # // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 # // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+# bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+# // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload (bool): offload host tensor operations to device
         swa_full (bool): use full-size SWA cache
+        kv_unified (bool): use a unified buffer across the input sequences when computing the attention
     """

     if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload: bool
         swa_full: bool
+        kv_unified: bool

     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]

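The context-level flag can be set the same way. A minimal sketch using llama_context_default_params() from the existing bindings; the value is illustrative:

    import llama_cpp

    # Sketch: use a unified KV buffer across input sequences.
    cparams = llama_cpp.llama_context_default_params()
    cparams.kv_unified = True
    # Upstream suggests disabling kv_unified when n_seq_max > 1 and the
    # sequences do not share a large prefix.
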
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
     ...


+# // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+# LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+@ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+    """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+    ...
+
+
 # // Returns 0 on success
 # LLAMA_API uint32_t llama_model_quantize(
 #     const char * fname_inp,
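
A hypothetical usage sketch for the new binding; "model.gguf" is a placeholder path, and llama_model_load_from_file() / llama_model_free() are assumed to be the loader and destructor exposed by this version of the bindings:

    import llama_cpp

    llama_cpp.llama_backend_init()
    mparams = llama_cpp.llama_model_default_params()
    model = llama_cpp.llama_model_load_from_file(b"model.gguf", mparams)  # placeholder path
    if model:
        print("diffusion-based:", llama_cpp.llama_model_is_diffusion(model))
        llama_cpp.llama_model_free(model)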
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
 # // in the order they have appeared in the batch.
 # // Rows: number of tokens for which llama_batch.logits[i] != 0
 # // Cols: n_vocab
+# // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
 # // in the order they have appeared in the batch.
 # // shape: [n_outputs*n_embd]
 # // Otherwise, returns NULL.
+# // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
 # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 @ctypes_function(
     "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     ...


+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",
@@ -4176,6 +4205,7 @@ def llama_log_set(

 # int32_t n_p_eval;
 # int32_t n_eval;
+# int32_t n_reused; // number of times a ggml compute graph had been reused
 # };
 class llama_perf_context_data(ctypes.Structure):
     _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
         ("t_eval_ms", ctypes.c_double),
         ("n_p_eval", ctypes.c_int32),
         ("n_eval", ctypes.c_int32),
+        ("n_reused", ctypes.c_int32),
     ]

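The new counter is readable through the existing perf helpers. A minimal sketch, assuming ctx is an active context and that llama_perf_context() is wrapped by these bindings as it is in llama.h:

    # Sketch: read context-level perf counters, including the new field.
    perf = llama_cpp.llama_perf_context(ctx)
    print("prompt tokens evaluated:", perf.n_p_eval)
    print("compute graphs reused:", perf.n_reused)
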
vendor/llama.cpp (submodule commit updated: 1 addition, 1 deletion)
