# //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
# LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
+ # LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
#
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };

# LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+ LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
LLAMA_FTYPE_GUESSED = 1024

# enum llama_rope_scaling_type {
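For orientation, a minimal sketch of how the new file type could be requested when quantizing through these bindings, assuming the existing llama_model_quantize_default_params()/llama_model_quantize() wrappers keep their current signatures; the file names are placeholders:

```python
import ctypes
import llama_cpp

# Sketch: quantize a GGUF model to the new MXFP4 MoE file type (paths are placeholders).
qparams = llama_cpp.llama_model_quantize_default_params()
qparams.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_MXFP4_MOE  # value 38, added in this commit
ret = llama_cpp.llama_model_quantize(
    b"model-f16.gguf", b"model-mxfp4.gguf", ctypes.byref(qparams)
)
if ret != 0:
    raise RuntimeError(f"quantization failed with code {ret}")
```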
@@ -691,6 +693,7 @@ class llama_model_kv_override(ctypes.Structure):
#     bool use_mmap;        // use mmap if possible
#     bool use_mlock;       // force system to keep model in RAM
#     bool check_tensors;   // validate model tensor data
+ #     bool use_extra_bufts; // use extra buffer types (used for weight repacking)
# };
class llama_model_params(ctypes.Structure):
    """Parameters for llama_model
@@ -708,7 +711,8 @@ class llama_model_params(ctypes.Structure):
        vocab_only (bool): only load the vocabulary, no weights
        use_mmap (bool): use mmap if possible
        use_mlock (bool): force system to keep model in RAM
-         check_tensors (bool): validate model tensor data"""
+         check_tensors (bool): validate model tensor data
+         use_extra_bufts (bool): use extra buffer types (used for weight repacking)"""

    if TYPE_CHECKING:
        devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
@@ -724,6 +728,7 @@ class llama_model_params(ctypes.Structure):
        use_mmap: bool
        use_mlock: bool
        check_tensors: bool
+         use_extra_bufts: bool

    _fields_ = [
        ("devices", ctypes.c_void_p),  # NOTE: unnused
@@ -739,6 +744,7 @@ class llama_model_params(ctypes.Structure):
        ("use_mmap", ctypes.c_bool),
        ("use_mlock", ctypes.c_bool),
        ("check_tensors", ctypes.c_bool),
+         ("use_extra_bufts", ctypes.c_bool),
    ]

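A minimal usage sketch for the new flag, assuming the existing llama_model_default_params() and llama_model_load_from_file() wrappers (llama_load_model_from_file in older bindings); the model path is a placeholder:

```python
import llama_cpp

llama_cpp.llama_backend_init()

mparams = llama_cpp.llama_model_default_params()
mparams.use_extra_bufts = True  # allow extra buffer types, e.g. for runtime weight repacking
model = llama_cpp.llama_model_load_from_file(b"model.gguf", mparams)  # placeholder path
if not model:
    raise RuntimeError("failed to load model")
```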
@@ -787,6 +793,9 @@ class llama_model_params(ctypes.Structure):
#     bool swa_full;  // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
#                     // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
#                     //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+ #     bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+ #                       // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+ #                       // ref: https://github.com/ggml-org/llama.cpp/pull/14363
# };
class llama_context_params(ctypes.Structure):
    """Parameters for llama_context
@@ -821,6 +830,7 @@ class llama_context_params(ctypes.Structure):
        no_perf (bool): whether to measure performance timings
        op_offload (bool): offload host tensor operations to device
        swa_full (bool): use full-size SWA cache
+         kv_unified (bool): use a unified buffer across the input sequences when computing the attention
    """

    if TYPE_CHECKING:
@@ -853,6 +863,7 @@ class llama_context_params(ctypes.Structure):
        no_perf: bool
        op_offload: bool
        swa_full: bool
+         kv_unified: bool

    _fields_ = [
        ("n_ctx", ctypes.c_uint32),
@@ -884,6 +895,7 @@ class llama_context_params(ctypes.Structure):
        ("no_perf", ctypes.c_bool),
        ("op_offload", ctypes.c_bool),
        ("swa_full", ctypes.c_bool),
+         ("kv_unified", ctypes.c_bool),
    ]

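A minimal sketch of how kv_unified relates to multi-sequence contexts, following the NOTE in the header comment above; it assumes llama_context_default_params() and llama_new_context_with_model() keep their current signatures and that `model` was loaded earlier:

```python
import llama_cpp

cparams = llama_cpp.llama_context_default_params()
cparams.n_seq_max = 4       # several independent sequences in one context
cparams.kv_unified = False  # per the note above: consider disabling when n_seq_max > 1
                            # and the sequences do not share a large prefix
ctx = llama_cpp.llama_new_context_with_model(model, cparams)  # `model` loaded earlier
if not ctx:
    raise RuntimeError("failed to create context")
```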
@@ -1651,6 +1663,14 @@ def llama_model_is_recurrent(model: llama_model_p, /) -> bool:
    ...


+ # // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
+ # LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
+ @ctypes_function("llama_model_is_diffusion", [llama_model_p_ctypes], ctypes.c_bool)
+ def llama_model_is_diffusion(model: llama_model_p, /) -> bool:
+     """Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)"""
+     ...
+
+
# // Returns 0 on success
# LLAMA_API uint32_t llama_model_quantize(
#         const char * fname_inp,
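A short sketch of the new predicate next to the existing llama_model_is_recurrent(); `model` is assumed to be an already-loaded llama_model_p:

```python
import llama_cpp

# Dispatch on the model's generation style (`model` loaded elsewhere).
if llama_cpp.llama_model_is_diffusion(model):
    print("diffusion-based LM (e.g. LLaDA, Dream): needs an iterative denoising loop")
elif llama_cpp.llama_model_is_recurrent(model):
    print("recurrent model (e.g. Mamba, RWKV)")
else:
    print("standard autoregressive transformer")
```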
@@ -2833,6 +2853,7 @@ def llama_synchronize(ctx: llama_context_p, /):
# // in the order they have appeared in the batch.
# // Rows: number of tokens for which llama_batch.logits[i] != 0
# // Cols: n_vocab
+ # // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
@ctypes_function(
    "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
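Given the deprecation note, a sketch of reading logits through llama_get_logits_ith() instead of the bulk pointer; `ctx` and `model` are assumed from a prior decode, and the vocab-size helper name is an assumption since it has changed across binding versions:

```python
import llama_cpp

# Logits for the last output of the batch (index -1) via the per-row accessor.
vocab = llama_cpp.llama_model_get_vocab(model)
n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)  # assumption: recent helper name
logits_ptr = llama_cpp.llama_get_logits_ith(ctx, -1)
logits = [logits_ptr[i] for i in range(n_vocab)]
```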
@@ -2873,6 +2894,7 @@ def llama_get_logits_ith(
# // in the order they have appeared in the batch.
# // shape: [n_outputs*n_embd]
# // Otherwise, returns NULL.
+ # // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
@ctypes_function(
    "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float)
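The embeddings accessor mirrors the logits case above; a brief sketch with llama_get_embeddings_ith(), assuming llama_model_n_embd() is available for the row width:

```python
import llama_cpp

# Embedding for the last output of the batch, mirroring the logits sketch above.
n_embd = llama_cpp.llama_model_n_embd(model)  # assumption: recent helper name
emb_ptr = llama_cpp.llama_get_embeddings_ith(ctx, -1)
embedding = [emb_ptr[i] for i in range(n_embd)] if emb_ptr else None
```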
@@ -3020,6 +3042,13 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
    ...


+ # LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+ @ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+ def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+     """mask"""
+     ...
+
+
# LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
@ctypes_function(
    "llama_vocab_get_add_bos",
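A sketch of querying the new special token, which is mainly relevant for diffusion-style models that fill in masked positions; it assumes the accessor returns -1 (LLAMA_TOKEN_NULL) when the vocabulary defines no mask token, as the other llama_vocab_* special-token accessors do:

```python
import llama_cpp

vocab = llama_cpp.llama_model_get_vocab(model)  # `model` loaded elsewhere
mask_token = llama_cpp.llama_vocab_mask(vocab)
if mask_token == -1:  # assumption: LLAMA_TOKEN_NULL, as with the bos/eos/pad accessors
    print("vocabulary defines no mask token")
else:
    print(f"mask token id: {mask_token}")
```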
@@ -4176,6 +4205,7 @@ def llama_log_set(

#     int32_t n_p_eval;
#     int32_t n_eval;
+ #     int32_t n_reused; // number of times a ggml compute graph had been reused
# };
class llama_perf_context_data(ctypes.Structure):
    _fields_ = [
@@ -4185,6 +4215,7 @@ class llama_perf_context_data(ctypes.Structure):
        ("t_eval_ms", ctypes.c_double),
        ("n_p_eval", ctypes.c_int32),
        ("n_eval", ctypes.c_int32),
+         ("n_reused", ctypes.c_int32),
    ]

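A sketch of reading the new counter, assuming llama_perf_context() continues to return a llama_perf_context_data by value; `ctx` is an existing context:

```python
import llama_cpp

perf = llama_cpp.llama_perf_context(ctx)  # `ctx` from a prior decode
print(f"prompt tokens: {perf.n_p_eval}, generated tokens: {perf.n_eval}")
print(f"compute graph reused {perf.n_reused} times")
```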