abetlen · abetlen · Jun 7, 2026 · Jun 7, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@9e3b928fd
 - feat(example): add OpenAI-compatible embeddings endpoint by @abetlen in #2281
 
 ## [0.3.27]

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
@@ -949,6 +949,10 @@ class llama_sampler_seq_config(ctypes.Structure):
 #                       // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 #     struct llama_sampler_seq_config * samplers;
 #     size_t                            n_samplers;
+#
+#     // a source/target/parent context
+#     // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts
+#     struct llama_context * ctx_other;
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -989,6 +993,7 @@ class llama_context_params(ctypes.Structure):
         kv_unified (bool): use a unified buffer across the input sequences when computing the attention
         samplers (ctypes.POINTER(llama_sampler_seq_config)): backend sampler chain configuration
         n_samplers (int): number of backend sampler chain configurations
+        ctx_other (llama_context_p): a source/target/parent context; can be used in various ways, e.g. to share results or llama_memory between two contexts
     """
 
     if TYPE_CHECKING:
@@ -1027,6 +1032,7 @@ class llama_context_params(ctypes.Structure):
         kv_unified: bool
         samplers: ctypes.POINTER(llama_sampler_seq_config)
         n_samplers: int
+        ctx_other: llama_context_p
 
     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -1064,6 +1070,7 @@ class llama_context_params(ctypes.Structure):
         ("kv_unified", ctypes.c_bool),
         ("samplers", ctypes.POINTER(llama_sampler_seq_config)),
         ("n_samplers", ctypes.c_size_t),
+        ("ctx_other", llama_context_p_ctypes),
     ]
 
 

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
+1 −1		common/arg.cpp
+1 −1		common/common.cpp
+49 −40		common/sampling.cpp
+1 −1		common/sampling.h
+50 −41		common/speculative.cpp
+2 −0		conversion/__init__.py
+10 −0		conversion/gemma.py
+3 −2		conversion/mistral.py
+1 −1		convert_hf_to_gguf.py
+24 −0		gguf-py/gguf/constants.py
+8 −0		gguf-py/gguf/tensor_mapping.py
+4 −0		include/llama.h
+5 −0		src/llama-arch.cpp
+3 −0		src/llama-arch.h
+37 −18		src/llama-context.cpp
+2 −1		src/llama-context.h
+2 −0		src/llama-cparams.h
+2 −0		src/llama-ext.h
+18 −13		src/llama-graph.cpp
+1 −0		src/llama-graph.h
+4 −0		src/llama-hparams.cpp
+4 −0		src/llama-hparams.h
+2 −2		src/llama-kv-cache-dsa.cpp
+15 −3		src/llama-kv-cache-iswa.cpp
+3 −1		src/llama-kv-cache-iswa.h
+122 −21		src/llama-kv-cache.cpp
+10 −3		src/llama-kv-cache.h
+2 −0		src/llama-kv-cells.h
+2 −0		src/llama-memory-hybrid-iswa.cpp
+2 −0		src/llama-memory-hybrid.cpp
+4 −0		src/llama-memory.h
+64 −23		src/llama-model.cpp
+5 −0		src/llama-model.h
+200 −0		src/models/gemma4-assistant.cpp
+18 −4		src/models/gemma4.cpp
+13 −0		src/models/models.h
+3 −3		tests/test-llama-archs.cpp
+17 −8		tools/server/server-context.cpp
+1 −1		tools/server/server-task.cpp