From a449adcd439b23339b96f30f7547e6ffb2957aff Mon Sep 17 00:00:00 2001
From: "tattn (Tatsuya Tanaka)" <contact@tattn.dev>
Date: Sat, 25 Apr 2026 12:32:44 +0900
Subject: [PATCH 1/3] Support Gemma 4

- Added symlinks for new headers (unicode.h, mtmd-debug.h, jinja, nlohmann) and a new module.modulemap file.
- Updated llama.cpp submodule to a new commit.
- Enhanced utils.h and utils.cpp with new functions for handling chat parameters and tool inputs.
- Removed obsolete minja symlink.
- Introduced new mtmd-image files for improved image handling.
- Updated Context.swift and Utility.swift for better integration with MLX.
- Modified glob patterns in Globs.swift to include new jinja files.
- Removed outdated LlamaToolCallParserTests and refactored LlamaToolTests for improved functionality.
- Added new tests for message processing and tool call parsing.
- Updated update_dependencies.sh to verify symlink integrity after submodule updates.
---
 .github/workflows/docc.yml                    |   8 +-
 .github/workflows/test.yml                    |   8 +-
 .github/workflows/update-dependencies.yml     |   4 +-
 Example/LocalLLMClientExample/AI.swift        |  29 +-
 .../LocalLLMClientExample/ChatViewModel.swift |   7 +
 .../LocalLLMClientExample/Downloader.swift    |   4 +-
 Package.resolved                              |  17 +-
 Package.swift                                 |  40 +-
 Sources/LocalLLMClientLlama/LlamaClient.swift |  31 +-
 .../LlamaToolCallParser.swift                 |  44 +-
 .../MessageProcessing/MessageProcessor.swift  |  16 +-
 Sources/LocalLLMClientLlama/Model.swift       |  47 +-
 Sources/LocalLLMClientLlama/Multimodal.swift  |   3 +-
 Sources/LocalLLMClientLlamaC/clip-graph.h     |   1 +
 Sources/LocalLLMClientLlamaC/clip-model.h     |   1 +
 .../common/build-info.cpp                     |  14 +
 .../LocalLLMClientLlamaC/common/build-info.h  |   1 +
 .../common/chat-auto-parser-generator.cpp     |   1 +
 .../common/chat-auto-parser-helpers.cpp       |   1 +
 .../common/chat-auto-parser-helpers.h         |   1 +
 .../common/chat-auto-parser.h                 |   1 +
 .../common/chat-diff-analyzer.cpp             |   1 +
 .../common/chat-parser.cpp                    |   1 -
 .../LocalLLMClientLlamaC/common/chat-parser.h |   1 -
 .../common/chat-peg-parser.cpp                |   1 +
 .../common/chat-peg-parser.h                  |   1 +
 Sources/LocalLLMClientLlamaC/common/log.cpp   |   1 +
 Sources/LocalLLMClientLlamaC/common/log.h     |   1 +
 .../common/peg-parser.cpp                     |   1 +
 .../LocalLLMClientLlamaC/common/peg-parser.h  |   1 +
 .../common/reasoning-budget.cpp               |   1 +
 .../common/reasoning-budget.h                 |   1 +
 .../LocalLLMClientLlamaC/common/sampling.cpp  |   1 +
 .../LocalLLMClientLlamaC/common/sampling.h    |   1 +
 .../LocalLLMClientLlamaC/common/unicode.cpp   |   1 +
 Sources/LocalLLMClientLlamaC/common/unicode.h |   1 +
 .../LocalLLMClientLlamaC/debug/mtmd-debug.h   |   1 +
 .../LocalLLMClientLlamaC/exclude/llama.cpp    |   2 +-
 Sources/LocalLLMClientLlamaC/include/jinja    |   1 +
 .../include/module.modulemap                  |  15 +
 Sources/LocalLLMClientLlamaC/include/nlohmann |   1 +
 Sources/LocalLLMClientLlamaC/include/utils.h  |  20 +
 Sources/LocalLLMClientLlamaC/minja            |   1 -
 Sources/LocalLLMClientLlamaC/models           |   1 +
 Sources/LocalLLMClientLlamaC/mtmd-image.cpp   |   1 +
 Sources/LocalLLMClientLlamaC/mtmd-image.h     |   1 +
 Sources/LocalLLMClientLlamaC/utils.cpp        |  52 ++
 Sources/LocalLLMClientMLX/Context.swift       |  22 +-
 Sources/LocalLLMClientMLX/Utility.swift       |   2 +-
 Sources/LocalLLMClientUtility/Globs.swift     |   6 +-
 .../LlamaToolCallParserTests.swift            | 146 ------
 .../LocalLLMClientLlamaToolTests.swift        | 474 ++++--------------
 .../MessageProcessorTests.swift               |  16 +
 .../LocalLLMClientLlamaTests/ModelTests.swift |   6 +-
 .../FilesMetadataTests.swift                  |   3 +-
 scripts/update_dependencies.sh                |  27 +-
 56 files changed, 432 insertions(+), 660 deletions(-)
 create mode 120000 Sources/LocalLLMClientLlamaC/clip-graph.h
 create mode 120000 Sources/LocalLLMClientLlamaC/clip-model.h
 create mode 100644 Sources/LocalLLMClientLlamaC/common/build-info.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/build-info.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-auto-parser-generator.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-auto-parser.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-diff-analyzer.cpp
 delete mode 120000 Sources/LocalLLMClientLlamaC/common/chat-parser.cpp
 delete mode 120000 Sources/LocalLLMClientLlamaC/common/chat-parser.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-peg-parser.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/chat-peg-parser.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/peg-parser.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/peg-parser.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/reasoning-budget.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/reasoning-budget.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/sampling.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/sampling.h
 create mode 120000 Sources/LocalLLMClientLlamaC/common/unicode.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/common/unicode.h
 create mode 120000 Sources/LocalLLMClientLlamaC/debug/mtmd-debug.h
 create mode 120000 Sources/LocalLLMClientLlamaC/include/jinja
 create mode 100644 Sources/LocalLLMClientLlamaC/include/module.modulemap
 create mode 120000 Sources/LocalLLMClientLlamaC/include/nlohmann
 delete mode 120000 Sources/LocalLLMClientLlamaC/minja
 create mode 120000 Sources/LocalLLMClientLlamaC/models
 create mode 120000 Sources/LocalLLMClientLlamaC/mtmd-image.cpp
 create mode 120000 Sources/LocalLLMClientLlamaC/mtmd-image.h
 delete mode 100644 Tests/LocalLLMClientLlamaTests/LlamaToolCallParserTests.swift

diff --git a/.github/workflows/docc.yml b/.github/workflows/docc.yml
index 4775469..c47e6c5 100644
--- a/.github/workflows/docc.yml
+++ b/.github/workflows/docc.yml
@@ -16,9 +16,9 @@ concurrency:
 
 jobs:
   generate-docc:
-    runs-on: macos-15
+    runs-on: macos-26
     env:
-      DEVELOPER_DIR: "/Applications/Xcode_16.4.app/Contents/Developer"
+      DEVELOPER_DIR: "/Applications/Xcode_26.4.app/Contents/Developer"
     steps:
       - uses: actions/checkout@v4
         with:
@@ -27,6 +27,10 @@ jobs:
       - name: Setup Pages
         uses: actions/configure-pages@v4
 
+      - name: Download Metal Toolchain
+        continue-on-error: true
+        run: xcodebuild -downloadComponent MetalToolchain
+
       - name: Build DocC
         # NOTE: LocalLLMClientMLX documentation is excluded because mlx-swift
         # symbol extraction requires Metal GPU support which is not available
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index cd2725b..2e11da1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -35,7 +35,7 @@ jobs:
         # To enable MLX tests, use self-hosted runners: runs-on: [self-hosted, macos]
         test-type: [Llama, FoundationModels]
     env:
-      DEVELOPER_DIR: "/Applications/Xcode_26.2.app/Contents/Developer"
+      DEVELOPER_DIR: "/Applications/Xcode_26.4.app/Contents/Developer"
       TEST_RUNNER_GITHUB_MODEL_CACHE: "${{ github.workspace }}/model_cache"
     steps:
       - &checkout
@@ -74,7 +74,7 @@ jobs:
     runs-on: macos-26
     needs: test-macos
     env:
-      DEVELOPER_DIR: "/Applications/Xcode_26.2.app/Contents/Developer"
+      DEVELOPER_DIR: "/Applications/Xcode_26.4.app/Contents/Developer"
     steps:
       - *checkout
 
@@ -90,7 +90,7 @@ jobs:
     runs-on: macos-26
     needs: test-macos
     env:
-      DEVELOPER_DIR: "/Applications/Xcode_26.2.app/Contents/Developer"
+      DEVELOPER_DIR: "/Applications/Xcode_26.4.app/Contents/Developer"
     steps:
       - *checkout
 
@@ -100,7 +100,7 @@ jobs:
         working-directory: Example
         run: |
           xcodebuild -downloadPlatform iOS
-          xcodebuild build -project LocalLLMClientExample.xcodeproj -scheme LocalLLMClientExample -destination 'platform=iOS Simulator,name=iPhone 17 Pro,OS=26.2' CODE_SIGN_IDENTITY="-"
+          xcodebuild build -project LocalLLMClientExample.xcodeproj -scheme LocalLLMClientExample -destination 'platform=iOS Simulator,name=iPhone 17 Pro,OS=26.4' CODE_SIGN_IDENTITY="-"
 
   test-ubuntu-x86_64:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/update-dependencies.yml b/.github/workflows/update-dependencies.yml
index 62c200a..4380f1f 100644
--- a/.github/workflows/update-dependencies.yml
+++ b/.github/workflows/update-dependencies.yml
@@ -7,9 +7,9 @@ on:
 
 jobs:
   update-dependencies:
-    runs-on: macos-15
+    runs-on: macos-26
     env:
-      DEVELOPER_DIR: "/Applications/Xcode_16.4.app/Contents/Developer"
+      DEVELOPER_DIR: "/Applications/Xcode_26.4.app/Contents/Developer"
     permissions:
       contents: write
       pull-requests: write
diff --git a/Example/LocalLLMClientExample/AI.swift b/Example/LocalLLMClientExample/AI.swift
index 114ffca..7f5b6d6 100644
--- a/Example/LocalLLMClientExample/AI.swift
+++ b/Example/LocalLLMClientExample/AI.swift
@@ -12,9 +12,11 @@ enum LLMModel: Sendable, CaseIterable, Identifiable {
     case qwen3_4b
     case qwen2_5VL_3b
     case gemma3_4b_mlx
+    case gemma4_e2b_mlx
     case phi4mini
     case gemma3
     case gemma3_4b
+    case gemma4_E2B
     case mobileVLM_3b
 
     static let `default` = qwen3
@@ -25,9 +27,11 @@ enum LLMModel: Sendable, CaseIterable, Identifiable {
         case .qwen3_4b: "MLX / Qwen3 4B"
         case .qwen2_5VL_3b: "MLX / Qwen2.5VL 3B"
         case .gemma3_4b_mlx: "MLX / Gemma3 4B"
+        case .gemma4_e2b_mlx: "MLX / Gemma4 E2B (4bit)"
         case .phi4mini: "llama.cpp / Phi-4 Mini 3.8B"
         case .gemma3: "llama.cpp / Gemma3 1B"
         case .gemma3_4b: "llama.cpp / Gemma3 4B"
+        case .gemma4_E2B: "llama.cpp / Gemma4 E2B"
         case .mobileVLM_3b: "llama.cpp / MobileVLM 3B"
         }
     }
@@ -38,30 +42,35 @@ enum LLMModel: Sendable, CaseIterable, Identifiable {
         case .qwen3_4b: "mlx-community/Qwen3-4B-4bit"
         case .qwen2_5VL_3b: "mlx-community/Qwen2.5-VL-3B-Instruct-abliterated-4bit"
         case .gemma3_4b_mlx: "mlx-community/gemma-3-4b-it-qat-4bit"
+        case .gemma4_e2b_mlx: "mlx-community/gemma-4-e2b-it-4bit"
         case .phi4mini: "unsloth/Phi-4-mini-instruct-GGUF"
         case .gemma3: "lmstudio-community/gemma-3-1B-it-qat-GGUF"
         case .gemma3_4b: "lmstudio-community/gemma-3-4B-it-qat-GGUF"
+        case .gemma4_E2B: "lmstudio-community/gemma-4-E2B-it-GGUF"
         case .mobileVLM_3b: "Blombert/MobileVLM-3B-GGUF"
         }
     }
 
     var filename: String? {
         switch self {
-        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .gemma3_4b_mlx: nil
+        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .gemma3_4b_mlx, .gemma4_e2b_mlx: nil
         case .phi4mini: "Phi-4-mini-instruct-Q4_K_M.gguf"
         case .gemma3: "gemma-3-1B-it-QAT-Q4_0.gguf"
         case .gemma3_4b: "gemma-3-4B-it-QAT-Q4_0.gguf"
+        case .gemma4_E2B: "gemma-4-E2B-it-Q4_K_M.gguf"
         case .mobileVLM_3b: "ggml-MobileVLM-3B-q5_k_s.gguf"
         }
     }
 
     var mmprojFilename: String? {
         switch self {
-        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .gemma3_4b_mlx, .phi4mini, .gemma3: nil
+        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .gemma3_4b_mlx, .gemma4_e2b_mlx, .phi4mini, .gemma3: nil
 #if os(macOS)
         case .gemma3_4b: "mmproj-model-f16.gguf"
+        case .gemma4_E2B: "mmproj-gemma-4-E4B-it-BF16.gguf"
 #elseif os(iOS)
-        case .gemma3_4b: nil
+        // Total footprint (model + mmproj ≈ 6 GB) exceeds what most iPhones can map; text-only on iOS.
+        case .gemma3_4b, .gemma4_E2B: nil
 #endif
         case .mobileVLM_3b: "mmproj-model-f16.gguf"
         }
@@ -75,11 +84,11 @@ enum LLMModel: Sendable, CaseIterable, Identifiable {
         switch self {
         case .qwen3, .qwen3_4b, .phi4mini, .gemma3: false
 #if os(macOS)
-        case .gemma3_4b: true
+        case .gemma3_4b, .gemma4_E2B: true
 #elseif os(iOS)
-        case .gemma3_4b: false
+        case .gemma3_4b, .gemma4_E2B: false
 #endif
-        case .qwen2_5VL_3b, .gemma3_4b_mlx, .mobileVLM_3b: true
+        case .qwen2_5VL_3b, .gemma3_4b_mlx, .gemma4_e2b_mlx, .mobileVLM_3b: true
         }
     }
 
@@ -87,14 +96,16 @@ enum LLMModel: Sendable, CaseIterable, Identifiable {
         switch self {
         case .gemma3_4b_mlx:
             return ["<end_of_turn>"]
-        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .phi4mini, .gemma3, .gemma3_4b, .mobileVLM_3b:
+        case .gemma4_e2b_mlx:
+            return ["<turn|>"]
+        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .phi4mini, .gemma3, .gemma3_4b, .gemma4_E2B, .mobileVLM_3b:
             return []
         }
     }
-    
+
     var supportsTools: Bool {
         switch self {
-        case .qwen3, .qwen3_4b, .phi4mini, .gemma3, .gemma3_4b:
+        case .qwen3, .qwen3_4b, .phi4mini, .gemma3, .gemma3_4b, .gemma4_E2B, .gemma4_e2b_mlx:
             return true
         case .qwen2_5VL_3b, .gemma3_4b_mlx, .mobileVLM_3b:
             return false
diff --git a/Example/LocalLLMClientExample/ChatViewModel.swift b/Example/LocalLLMClientExample/ChatViewModel.swift
index ab7924c..e2685b0 100644
--- a/Example/LocalLLMClientExample/ChatViewModel.swift
+++ b/Example/LocalLLMClientExample/ChatViewModel.swift
@@ -14,9 +14,14 @@ final class ChatViewModel {
     private var ai: AI
     private var generateTask: Task<Void, Never>?
     private var generatingText = ""
+    /// Optimistically displayed user message until it lands in `ai.messages`.
+    private var pendingUserMessage: LLMInput.Message?
 
     var messages: [LLMInput.Message] {
         var messages = ai.messages
+        if let pendingUserMessage, messages.last?.role != .user {
+            messages.append(pendingUserMessage)
+        }
         if !generatingText.isEmpty, messages.last?.role != .assistant {
             messages.append(.assistant(generatingText))
         }
@@ -33,6 +38,7 @@ final class ChatViewModel {
         let currentInput = (text: inputText, images: inputAttachments)
         inputText = ""
         inputAttachments = []
+        pendingUserMessage = .user(currentInput.text, attachments: currentInput.images)
 
         generateTask = Task {
             generatingText = ""
@@ -46,6 +52,7 @@ final class ChatViewModel {
                 (inputText, inputAttachments) = currentInput
             }
 
+            pendingUserMessage = nil
             generateTask = nil
             generatingText = ""
         }
diff --git a/Example/LocalLLMClientExample/Downloader.swift b/Example/LocalLLMClientExample/Downloader.swift
index cd367ff..8ac15d6 100644
--- a/Example/LocalLLMClientExample/Downloader.swift
+++ b/Example/LocalLLMClientExample/Downloader.swift
@@ -5,8 +5,8 @@ struct Downloader: Sendable {
     init(model: LLMModel) {
         self.model = model
         let globs: Globs = switch model {
-        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .gemma3_4b_mlx: .mlx
-        case .phi4mini, .gemma3, .gemma3_4b, .mobileVLM_3b: .init(
+        case .qwen3, .qwen3_4b, .qwen2_5VL_3b, .gemma3_4b_mlx, .gemma4_e2b_mlx: .mlx
+        case .phi4mini, .gemma3, .gemma3_4b, .gemma4_E2B, .mobileVLM_3b: .init(
             (model.filename.map { [$0] } ?? []) + (model.mmprojFilename.map { [$0] } ?? [])
         )}
 #if os(macOS)
diff --git a/Package.resolved b/Package.resolved
index 917377d..1fe8ed1 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -1,5 +1,5 @@
 {
-  "originHash" : "c708fe6da241f5f654a397439002b63f3d60f654ef69d1075851dde3f967586f",
+  "originHash" : "f2adb03887aa68beac074e1a360a1252567315c0efcac21bf08bf605120abb14",
   "pins" : [
     {
       "identity" : "eventsource",
@@ -24,7 +24,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/ml-explore/mlx-swift-lm",
       "state" : {
-        "revision" : "2a296f145c3129fea4290bb6e4a0a5fb458efa06"
+        "revision" : "1c05248bb0899e2a7a4962b84d319cf12f4e12aa",
+        "version" : "3.31.3"
       }
     },
     {
@@ -104,8 +105,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/huggingface/swift-jinja",
       "state" : {
-        "revision" : "62b91283572c80a9d79fe77e2fa344cfd9233cfa",
-        "version" : "2.0.2"
+        "revision" : "0aeefadec459ce8e11a333769950fb86183aca43",
+        "version" : "2.3.5"
       }
     },
     {
@@ -129,7 +130,7 @@
     {
       "identity" : "swift-syntax",
       "kind" : "remoteSourceControl",
-      "location" : "https://github.com/swiftlang/swift-syntax.git",
+      "location" : "https://github.com/swiftlang/swift-syntax",
       "state" : {
         "revision" : "0687f71944021d616d34d922343dcef086855920",
         "version" : "600.0.1"
@@ -147,10 +148,10 @@
     {
       "identity" : "swift-transformers",
       "kind" : "remoteSourceControl",
-      "location" : "https://github.com/huggingface/swift-transformers.git",
+      "location" : "https://github.com/huggingface/swift-transformers",
       "state" : {
-        "revision" : "58c4bc11963a140358d791f678a60a2745a23146",
-        "version" : "1.2.1"
+        "revision" : "b38443e44d93eca770f2eb68e2a4d0fa100f9aa2",
+        "version" : "1.3.0"
       }
     },
     {
diff --git a/Package.swift b/Package.swift
index aca59cf..e58e16f 100644
--- a/Package.swift
+++ b/Package.swift
@@ -3,36 +3,21 @@
 import PackageDescription
 import CompilerPluginSupport
 
-let llamaVersion = "b6871"
+let llamaVersion = "b8851"
+let llamaBuildNumber = String(llamaVersion.dropFirst())
 
 // MARK: - Package Dependencies
 
 var packageDependencies: [Package.Dependency] = [
     .package(url: "https://github.com/apple/swift-argument-parser.git", .upToNextMinor(from: "1.4.0")),
-    .package(url: "https://github.com/huggingface/swift-jinja", .upToNextMinor(from: "2.0.0")),
+    .package(url: "https://github.com/huggingface/swift-jinja", from: "2.3.5"),
     .package(url: "https://github.com/swiftlang/swift-syntax", from: "600.0.0")
 ]
 
 #if os(iOS) || os(macOS)
 packageDependencies.append(contentsOf: [
-    // mlx-swift-lm v3 (PR #118 merged 2026-04-01) removed
-    // `loadTokenizer(configuration:hub:)` and reshaped the Hub/Downloader
-    // API; `LocalLLMClientMLX/Context.swift` still uses the old API. Until
-    // the MLX backend is migrated to v3 (`AutoTokenizer.from(directory:)` +
-    // `Downloader`), pin to the last pre-v3 commit so consumers can build.
-    // Tracked in LocalLLMClient#93 — switch back to `branch: "main"` once
-    // Context.swift is migrated.
-    .package(
-        url: "https://github.com/ml-explore/mlx-swift-lm",
-        revision: "2a296f145c3129fea4290bb6e4a0a5fb458efa06"  // 2026-03-27, last pre-v3
-    ),
-    // `Tokenizers` (from swift-transformers) is what `LocalLLMClientMLX`
-    // imports for `any Tokenizer`. Pre-v3 mlx-swift-lm transitively pulled
-    // swift-transformers in, but its Package.swift didn't declare it as a
-    // public re-export, so consumers still need to depend on it directly.
-    // Range matches the pre-v3 mlx-swift-lm transitive pin so SPM resolves.
-    // Bump to `from: "1.3.0"` once Context.swift is migrated to mlx-swift-lm v3.
-    .package(url: "https://github.com/huggingface/swift-transformers.git", "1.2.0"..<"1.3.0"),
+    .package(url: "https://github.com/ml-explore/mlx-swift-lm", from: "3.31.3"),
+    .package(url: "https://github.com/huggingface/swift-transformers", from: "1.3.0"),
     .package(url: "https://github.com/apple/swift-docc-plugin", from: "1.4.0")
 ])
 #endif
@@ -152,6 +137,7 @@ packageTargets.append(contentsOf: [
             "LocalLLMClientCore",
             .product(name: "MLXLLM", package: "mlx-swift-lm"),
             .product(name: "MLXVLM", package: "mlx-swift-lm"),
+            .product(name: "MLXHuggingFace", package: "mlx-swift-lm"),
             .product(name: "Tokenizers", package: "swift-transformers"),
         ],
     ),
@@ -172,7 +158,7 @@ packageTargets.append(contentsOf: [
         name: "LocalLLMClientLlamaFramework",
         url:
             "https://github.com/ggml-org/llama.cpp/releases/download/\(llamaVersion)/llama-\(llamaVersion)-xcframework.zip",
-        checksum: "ac657d70112efadbf5cd1db5c4f67eea94ca38556ada9e7442d5a5a461010d6f"
+        checksum: "f5eb26820b9890ae026aee4963cd4f43af1c567d39534012f2685601a59c2519"
     ),
     .target(
         name: "LocalLLMClientLlamaC",
@@ -180,10 +166,15 @@ packageTargets.append(contentsOf: [
         exclude: ["exclude"],
         cSettings: [
             .unsafeFlags(["-w"]),
-            .headerSearchPath(".")
+            .define("LLAMA_BUILD_NUMBER", to: llamaBuildNumber),
+            .headerSearchPath("."),
+            .headerSearchPath("common")
         ],
         cxxSettings: [
-            .headerSearchPath(".")
+            .unsafeFlags(["-UDEBUG"]),
+            .define("LLAMA_BUILD_NUMBER", to: llamaBuildNumber),
+            .headerSearchPath("."),
+            .headerSearchPath("common")
         ],
         swiftSettings: [
             .interoperabilityMode(.Cxx)
@@ -194,7 +185,8 @@ packageTargets.append(contentsOf: [
         name: "LocalLLMClientUtilityTests",
         dependencies: [
             "LocalLLMClientUtility",
-            .product(name: "MLXLMCommon", package: "mlx-swift-lm")
+            .product(name: "MLXLMCommon", package: "mlx-swift-lm"),
+            .product(name: "Hub", package: "swift-transformers"),
         ]
     )
 ])
diff --git a/Sources/LocalLLMClientLlama/LlamaClient.swift b/Sources/LocalLLMClientLlama/LlamaClient.swift
index 0ddd82e..14e0320 100644
--- a/Sources/LocalLLMClientLlama/LlamaClient.swift
+++ b/Sources/LocalLLMClientLlama/LlamaClient.swift
@@ -10,9 +10,11 @@ public final class LlamaClient: LLMClient {
     private let multimodal: MultimodalContext?
     private let messageProcessor: MessageProcessor
     let tools: [AnyLLMTool]
+    /// Owned by this client; freed in `deinit`.
+    nonisolated(unsafe) private let chatParamsPtr: UnsafeMutablePointer<llm_chat_params>?
 
     var chatFormat: common_chat_format {
-        context.model.chatFormat()
+        get_chat_params_format(chatParamsPtr)
     }
 
     /// Initializes a new Llama client.
@@ -38,7 +40,15 @@ public final class LlamaClient: LLMClient {
             multimodal = nil
         }
         self.messageProcessor = messageProcessor ?? MessageProcessorFactory.createAutoProcessor(chatTemplate: context.model.chatTemplate)
-        self.tools = tools.map { AnyLLMTool($0) }
+        let wrappedTools = tools.map { AnyLLMTool($0) }
+        self.tools = wrappedTools
+        self.chatParamsPtr = context.model.buildChatParams(tools: wrappedTools)
+    }
+
+    deinit {
+        if let chatParamsPtr {
+            free_chat_params(chatParamsPtr)
+        }
     }
 
     /// Generates a text stream from the given input.
@@ -82,8 +92,7 @@ public final class LlamaClient: LLMClient {
     public func responseStream(from input: LLMInput) async throws -> AsyncThrowingStream<StreamingChunk, any Error> {
         // Create the stream first (this can throw)
         let textStreamGenerator = try textStream(from: input)
-        let chatFormat = self.chatFormat
-        
+
         return AsyncThrowingStream { continuation in
             let processor = StreamingToolCallProcessor(
                 startTag: getToolCallStartTag(),
@@ -103,7 +112,11 @@ public final class LlamaClient: LLMClient {
                         }
                     }
 
-                    var toolCalls = processor.toolCalls + (LlamaToolCallParser.parseToolCalls(from: fullText, format: chatFormat) ?? [])
+                    let parserToolCalls = LlamaToolCallParser.parseToolCalls(
+                        from: fullText,
+                        chatParams: chatParamsPtr
+                    ) ?? []
+                    var toolCalls = processor.toolCalls + parserToolCalls
                     toolCalls = toolCalls.reduce(into: []) { result, toolCall in
                         if !result.contains(where: { $0.name == toolCall.name }) {
                             result.append(toolCall)
@@ -124,17 +137,19 @@ public final class LlamaClient: LLMClient {
     
     /// Get the tool call start tag based on chat format
     private func getToolCallStartTag() -> String {
-        // Different chat formats may use different tags
         switch chatFormat {
+        case COMMON_CHAT_FORMAT_PEG_GEMMA4:
+            return "<|tool_call>"
         default:
             return "<tool_call>"
         }
     }
-    
+
     /// Get the tool call end tag based on chat format
     private func getToolCallEndTag() -> String {
-        // Different chat formats may use different tags
         switch chatFormat {
+        case COMMON_CHAT_FORMAT_PEG_GEMMA4:
+            return "<tool_call|>"
         default:
             return "</tool_call>"
         }
diff --git a/Sources/LocalLLMClientLlama/LlamaToolCallParser.swift b/Sources/LocalLLMClientLlama/LlamaToolCallParser.swift
index af29f68..c0b3b94 100644
--- a/Sources/LocalLLMClientLlama/LlamaToolCallParser.swift
+++ b/Sources/LocalLLMClientLlama/LlamaToolCallParser.swift
@@ -11,49 +11,33 @@ import LocalLLMClientCore
 
 /// A utility to parse tool calls from a response generated by a model using llama.cpp's common_chat_parse
 struct LlamaToolCallParser {
-    /// Parses a string for tool calls with a specific chat format
+    /// Parses a string for tool calls using the parser context derived from the model's chat template.
     ///
     /// - Parameters:
     ///   - response: The string to parse for tool calls
-    ///   - format: The specific chat format to use for parsing
+    ///   - chatParams: Pointer to the `llm_chat_params` owned by the calling `LlamaClient` (built via `Model.buildChatParams(tools:)`); when `nil`, this method returns `nil`.
     /// - Returns: An array of LLMToolCall objects if any were found, otherwise nil
-    public static func parseToolCalls(from response: String, format: common_chat_format) -> [LLMToolCall]? {
-        var syntax = common_chat_syntax()
-        syntax.format = format
-        syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE
-        syntax.reasoning_in_content = false
-        syntax.thinking_forced_open = false
-        syntax.parse_tool_calls = true
-        
-        let parsedMessage = common_chat_parse(std.string(response), false, syntax)
+    public static func parseToolCalls(from response: String, chatParams: UnsafeMutablePointer<llm_chat_params>?) -> [LLMToolCall]? {
+        guard let chatParams else { return nil }
+
+        let parsedMessage = response.withCString { cstr in
+            parse_chat_response(chatParams, cstr, false)
+        }
         guard !parsedMessage.tool_calls.empty() else {
             return nil
         }
-        
+
         var toolCalls: [LLMToolCall] = []
-        
         for i in 0..<parsedMessage.tool_calls.size() {
             let cppToolCall = parsedMessage.tool_calls[i]
-            
-            let id: String
-            if cppToolCall.id.empty() {
-                id = UUID().uuidString
-            } else {
-                id = String(cppToolCall.id)
-            }
-            
+
+            let id = cppToolCall.id.empty() ? UUID().uuidString : String(cppToolCall.id)
             let name = String(cppToolCall.name)
             let arguments = String(cppToolCall.arguments)
-            
-            let toolCall = LLMToolCall(
-                id: id,
-                name: name,
-                arguments: arguments
-            )
-            
-            toolCalls.append(toolCall)
+
+            toolCalls.append(LLMToolCall(id: id, name: name, arguments: arguments))
         }
-        
+
         return toolCalls.isEmpty ? nil : toolCalls
     }
 }
diff --git a/Sources/LocalLLMClientLlama/MessageProcessing/MessageProcessor.swift b/Sources/LocalLLMClientLlama/MessageProcessing/MessageProcessor.swift
index a09d226..b189e44 100644
--- a/Sources/LocalLLMClientLlama/MessageProcessing/MessageProcessor.swift
+++ b/Sources/LocalLLMClientLlama/MessageProcessing/MessageProcessor.swift
@@ -186,7 +186,16 @@ public struct MessageProcessorFactory {
             chunkExtractor: RegexChunkExtractor(imageTokenPattern: "<start_of_image>")
         )
     }
-    
+
+    /// Create a processor for Gemma4 models
+    public static func gemma4Processor() -> MessageProcessor {
+        MessageProcessor(
+            transformer: StandardMessageTransformer(),
+            renderer: JinjaChatTemplateRenderer(),
+            chunkExtractor: RegexChunkExtractor(imageTokenPattern: "<\\|image\\|>")
+        )
+    }
+
     /// Create a processor for SmolVLM models  
     public static func smolVLMProcessor() -> MessageProcessor {
         MessageProcessor(
@@ -199,7 +208,10 @@ public struct MessageProcessorFactory {
     /// Create an auto-detecting processor based on chat template
     public static func createAutoProcessor(chatTemplate: String) -> MessageProcessor {
         // Check for specific template patterns
-        if chatTemplate.contains("<|im_start|>") && chatTemplate.contains("<end_of_utterance>") {
+        if chatTemplate.contains("<|turn>") {
+            // Gemma4 format (checked first because the template also contains content-type checks similar to Qwen2VL)
+            return gemma4Processor()
+        } else if chatTemplate.contains("<|im_start|>") && chatTemplate.contains("<end_of_utterance>") {
             // SmolVLM format
             return smolVLMProcessor()
         } else if chatTemplate.contains("content[i].type == 'image'") || chatTemplate.contains("<vision>") {
diff --git a/Sources/LocalLLMClientLlama/Model.swift b/Sources/LocalLLMClientLlama/Model.swift
index 5a8714d..a6493db 100644
--- a/Sources/LocalLLMClientLlama/Model.swift
+++ b/Sources/LocalLLMClientLlama/Model.swift
@@ -24,7 +24,7 @@ final class Model {
 
         self.model = model
 
-        let chatTemplate = getString(capacity: 8192) { buffer, length in
+        let chatTemplate = getString { buffer, length in
             // LLM_KV_TOKENIZER_CHAT_TEMPLATE
             llama_model_meta_val_str(model, "tokenizer.chat_template", buffer, length)
         }
@@ -47,33 +47,56 @@ final class Model {
     func tokenizerConfigs() -> [String: Any] {
         let numberOfConfigs = llama_model_meta_count(model)
         return (0..<numberOfConfigs).reduce(into: [:]) { partialResult, i in
-            let key = getString(capacity: 64) { buffer, length in
+            let key = getString(minimumCapacity: 64) { buffer, length in // 64 is only a floor; getString enlarges the buffer for longer keys
                 llama_model_meta_key_by_index(model, i, buffer, length)
             }
-            let value = getString(capacity: 2048) { buffer, length in
+            let value = getString(minimumCapacity: 2048) { buffer, length in // 2048 is only a floor; longer values get a larger buffer
                 llama_model_meta_val_str_by_index(model, i, buffer, length)
             }
             partialResult[key] = value
         }
     }
 
-    func chatFormat() -> common_chat_format {
+    /// Build a chat parser context for this model using the provided tools.
+    ///
+    /// In the PEG-grammar era of llama.cpp, the generated parser depends on both
+    /// the chat template and the tool list, so ownership belongs to whoever has
+    /// the tool list (i.e. `LlamaClient`), not the `Model` itself.
+    ///
+    /// Returns `nil` when the C layer cannot build the params; a non-nil result must be freed with `free_chat_params`.
+    func buildChatParams(tools: [AnyLLMTool]) -> UnsafeMutablePointer<llm_chat_params>? {
         let inputs = create_chat_templates_inputs()
-        defer { 
+        defer {
             free_chat_templates_inputs(inputs)
         }
-
-        add_message_to_inputs(inputs, "user", "test")
-        let params = apply_chat_templates_with_model(model, inputs)
-        return params.format
+        add_message_to_inputs(inputs, "user", "probe")
+        for tool in tools {
+            let oaiJSON = tool.toOAICompatJSON()
+            guard let function = oaiJSON["function"] as? [String: Any],
+                  let name = function["name"] as? String else { continue } // skip tools that expose no function name
+            let description = function["description"] as? String ?? ""
+            let parametersJSON: String // falls back to an empty JSON object when no usable schema is present
+            if let parameters = function["parameters"], JSONSerialization.isValidJSONObject(parameters), // data(withJSONObject:) raises an uncatchable ObjC exception on invalid input
+               let data = try? JSONSerialization.data(withJSONObject: parameters),
+               let str = String(data: data, encoding: .utf8) {
+                parametersJSON = str
+            } else {
+                parametersJSON = "{}"
+            }
+            add_tool_to_inputs(inputs, name, description, parametersJSON)
+        }
+        return create_chat_params(model, inputs)
     }
 }
 
-private func getString(capacity: Int = 1024, getter: (UnsafeMutablePointer<CChar>?, Int) -> Int32) -> String {
-    String(unsafeUninitializedCapacity: capacity) { buffer in
+private func getString(minimumCapacity: Int = 1024, getter: (UnsafeMutablePointer<CChar>?, Int) -> Int32) -> String {
+    var probe: CChar = 0 // 1-byte scratch buffer: the first call only asks the getter how long the value is
+    let required = Int(getter(&probe, 1)) // may be negative if the getter reports failure; the max() calls below absorb that
+    let capacity = max(minimumCapacity, required + 1) // +1 leaves room for the NUL terminator (llama.h: output is always NUL-terminated)
+    return String(unsafeUninitializedCapacity: capacity) { buffer in
         buffer.withMemoryRebound(to: CChar.self) { buffer in
             let length = Int(getter(buffer.baseAddress, capacity))
-            return max(0, length)
+            return max(0, min(length, capacity - 1)) // never count the terminator byte as text, even if the reported length exceeds the buffer
         }
     }
 }
diff --git a/Sources/LocalLLMClientLlama/Multimodal.swift b/Sources/LocalLLMClientLlama/Multimodal.swift
index 4bc31e4..50246a6 100644
--- a/Sources/LocalLLMClientLlama/Multimodal.swift
+++ b/Sources/LocalLLMClientLlama/Multimodal.swift
@@ -13,7 +13,6 @@ public class MultimodalContext: @unchecked Sendable {
         if let numberOfThreads = parameter.numberOfThreads {
             mparams.n_threads = Int32(numberOfThreads)
         }
-        mparams.verbosity = parameter.options.verbose ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_CONT;
         guard let multimodalContext = mtmd_init_from_file(url.path(percentEncoded: false), context.model.model, mparams) else {
             throw .failedToLoad(reason: "Failed to load the mmproj file")
         }
@@ -40,7 +39,7 @@ public class MultimodalContext: @unchecked Sendable {
 
         let chunks = mtmd_input_chunks_init()!
 
-        let textStorage = "    \(MTMD_DEFAULT_IMAGE_MARKER)    " // spaces for the workaround of tokenizer
+        let textStorage = "    \(String(cString: mtmd_default_marker()))    " // spaces for the workaround of tokenizer
         var text = textStorage.withCString {
             mtmd_input_text(text: $0, add_special: false, parse_special: true)
         }
diff --git a/Sources/LocalLLMClientLlamaC/clip-graph.h b/Sources/LocalLLMClientLlamaC/clip-graph.h
new file mode 120000
index 0000000..0027171
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/clip-graph.h
@@ -0,0 +1 @@
+exclude/llama.cpp/tools/mtmd/clip-graph.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/clip-model.h b/Sources/LocalLLMClientLlamaC/clip-model.h
new file mode 120000
index 0000000..4b450cb
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/clip-model.h
@@ -0,0 +1 @@
+exclude/llama.cpp/tools/mtmd/clip-model.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/build-info.cpp b/Sources/LocalLLMClientLlamaC/common/build-info.cpp
new file mode 100644
index 0000000..44bc8be
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/build-info.cpp
@@ -0,0 +1,14 @@
+#include "build-info.h"
+
+#ifndef LLAMA_BUILD_NUMBER
+#define LLAMA_BUILD_NUMBER 0 // fallback used when the build does not define a build number
+#endif
+
+#define LLAMA_BUILD_INFO_STRINGIFY_(x) #x
+#define LLAMA_BUILD_INFO_STRINGIFY(x) LLAMA_BUILD_INFO_STRINGIFY_(x) // two-level expansion so the macro's value, not its name, is stringified
+
+int llama_build_number(void) { return LLAMA_BUILD_NUMBER; }
+const char * llama_commit(void) { return ""; } // this stub reports an empty commit string
+const char * llama_compiler(void) { return ""; } // this stub reports an empty compiler string
+const char * llama_build_target(void) { return ""; } // this stub reports an empty build-target string
+const char * llama_build_info(void) { return "b" LLAMA_BUILD_INFO_STRINGIFY(LLAMA_BUILD_NUMBER); } // "b" followed by the build number, e.g. "b0"
diff --git a/Sources/LocalLLMClientLlamaC/common/build-info.h b/Sources/LocalLLMClientLlamaC/common/build-info.h
new file mode 120000
index 0000000..b4f9fbe
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/build-info.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/build-info.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-generator.cpp b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-generator.cpp
new file mode 120000
index 0000000..c1664cb
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-generator.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-auto-parser-generator.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.cpp b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.cpp
new file mode 120000
index 0000000..a9da30b
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-auto-parser-helpers.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.h b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.h
new file mode 120000
index 0000000..0f9c92b
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser-helpers.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-auto-parser-helpers.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-auto-parser.h b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser.h
new file mode 120000
index 0000000..d1c0da5
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-auto-parser.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-auto-parser.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-diff-analyzer.cpp b/Sources/LocalLLMClientLlamaC/common/chat-diff-analyzer.cpp
new file mode 120000
index 0000000..0af8837
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-diff-analyzer.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-diff-analyzer.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-parser.cpp b/Sources/LocalLLMClientLlamaC/common/chat-parser.cpp
deleted file mode 120000
index 1304458..0000000
--- a/Sources/LocalLLMClientLlamaC/common/chat-parser.cpp
+++ /dev/null
@@ -1 +0,0 @@
-../exclude/llama.cpp/common/chat-parser.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-parser.h b/Sources/LocalLLMClientLlamaC/common/chat-parser.h
deleted file mode 120000
index 0ffce12..0000000
--- a/Sources/LocalLLMClientLlamaC/common/chat-parser.h
+++ /dev/null
@@ -1 +0,0 @@
-../exclude/llama.cpp/common/chat-parser.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-peg-parser.cpp b/Sources/LocalLLMClientLlamaC/common/chat-peg-parser.cpp
new file mode 120000
index 0000000..6ffe432
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-peg-parser.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-peg-parser.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/chat-peg-parser.h b/Sources/LocalLLMClientLlamaC/common/chat-peg-parser.h
new file mode 120000
index 0000000..3c8dd29
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/chat-peg-parser.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/chat-peg-parser.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/log.cpp b/Sources/LocalLLMClientLlamaC/common/log.cpp
index ec3b36e..ca48376 100644
--- a/Sources/LocalLLMClientLlamaC/common/log.cpp
+++ b/Sources/LocalLLMClientLlamaC/common/log.cpp
@@ -4,3 +4,4 @@ int common_log_verbosity_thold = 0;
 
 struct common_log * common_log_main() { return nullptr; }
 void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {}
+void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data) {} // no-op stub: every message is discarded
diff --git a/Sources/LocalLLMClientLlamaC/common/log.h b/Sources/LocalLLMClientLlamaC/common/log.h
index 69c2ad7..9b4f1e7 100644
--- a/Sources/LocalLLMClientLlamaC/common/log.h
+++ b/Sources/LocalLLMClientLlamaC/common/log.h
@@ -15,3 +15,4 @@ struct common_log;
 struct common_log * common_log_init();
 struct common_log * common_log_main();
 void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
+void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data);
diff --git a/Sources/LocalLLMClientLlamaC/common/peg-parser.cpp b/Sources/LocalLLMClientLlamaC/common/peg-parser.cpp
new file mode 120000
index 0000000..3ce9c4e
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/peg-parser.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/peg-parser.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/peg-parser.h b/Sources/LocalLLMClientLlamaC/common/peg-parser.h
new file mode 120000
index 0000000..be21892
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/peg-parser.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/peg-parser.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/reasoning-budget.cpp b/Sources/LocalLLMClientLlamaC/common/reasoning-budget.cpp
new file mode 120000
index 0000000..b2d4a33
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/reasoning-budget.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/reasoning-budget.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/reasoning-budget.h b/Sources/LocalLLMClientLlamaC/common/reasoning-budget.h
new file mode 120000
index 0000000..dc1642e
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/reasoning-budget.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/reasoning-budget.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/sampling.cpp b/Sources/LocalLLMClientLlamaC/common/sampling.cpp
new file mode 120000
index 0000000..1e54e29
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/sampling.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/sampling.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/sampling.h b/Sources/LocalLLMClientLlamaC/common/sampling.h
new file mode 120000
index 0000000..fa7f215
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/sampling.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/sampling.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/unicode.cpp b/Sources/LocalLLMClientLlamaC/common/unicode.cpp
new file mode 120000
index 0000000..114dea4
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/unicode.cpp
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/unicode.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/common/unicode.h b/Sources/LocalLLMClientLlamaC/common/unicode.h
new file mode 120000
index 0000000..cac46ea
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/common/unicode.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/unicode.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/debug/mtmd-debug.h b/Sources/LocalLLMClientLlamaC/debug/mtmd-debug.h
new file mode 120000
index 0000000..ffe5f85
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/debug/mtmd-debug.h
@@ -0,0 +1 @@
+../exclude/llama.cpp/tools/mtmd/debug/mtmd-debug.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/exclude/llama.cpp b/Sources/LocalLLMClientLlamaC/exclude/llama.cpp
index 9a3ea68..e365e65 160000
--- a/Sources/LocalLLMClientLlamaC/exclude/llama.cpp
+++ b/Sources/LocalLLMClientLlamaC/exclude/llama.cpp
@@ -1 +1 @@
-Subproject commit 9a3ea685b937c0f0cbfda2e50004ea54bf187512
+Subproject commit e365e658f07b63371489570dfde597f199b26c23
diff --git a/Sources/LocalLLMClientLlamaC/include/jinja b/Sources/LocalLLMClientLlamaC/include/jinja
new file mode 120000
index 0000000..14e028e
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/include/jinja
@@ -0,0 +1 @@
+../exclude/llama.cpp/common/jinja
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/include/module.modulemap b/Sources/LocalLLMClientLlamaC/include/module.modulemap
new file mode 100644
index 0000000..bdea50a
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/include/module.modulemap
@@ -0,0 +1,15 @@
+module LocalLLMClientLlamaC {
+    header "LocalLLMClientLlamaC.h"
+    header "clip.h"
+    header "ggml-alloc.h"
+    header "ggml-backend.h"
+    header "ggml-cpu.h"
+    header "ggml-opt.h"
+    header "ggml.h"
+    header "gguf.h"
+    header "llama.h"
+    header "mtmd-helper.h"
+    header "mtmd.h"
+    header "utils.h"
+    export *
+}
diff --git a/Sources/LocalLLMClientLlamaC/include/nlohmann b/Sources/LocalLLMClientLlamaC/include/nlohmann
new file mode 120000
index 0000000..8b02d58
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/include/nlohmann
@@ -0,0 +1 @@
+../exclude/llama.cpp/vendor/nlohmann
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/include/utils.h b/Sources/LocalLLMClientLlamaC/include/utils.h
index 4ed6570..90124d8 100644
--- a/Sources/LocalLLMClientLlamaC/include/utils.h
+++ b/Sources/LocalLLMClientLlamaC/include/utils.h
@@ -11,6 +11,26 @@ common_chat_templates* get_common_chat_templates(const common_chat_templates_ptr
 // Wrapper functions for Swift C++ interop
 common_chat_templates_inputs* create_chat_templates_inputs();
 void add_message_to_inputs(common_chat_templates_inputs* inputs, const char* role, const char* content);
+void add_tool_to_inputs(common_chat_templates_inputs* inputs, const char* name, const char* description, const char* parameters_json);
 common_chat_params apply_chat_templates_safe(const common_chat_templates* tmpls, common_chat_templates_inputs* inputs);
 common_chat_params apply_chat_templates_with_model(const struct llama_model* model, common_chat_templates_inputs* inputs);
 void free_chat_templates_inputs(common_chat_templates_inputs* inputs);
+
+// Heap-allocated common_chat_params plus pre-built PEG arena.
+// The `parser` string on common_chat_params is the serialized PEG grammar;
+// callers need the deserialized common_peg_arena to run common_chat_parse.
+// Keeping both together gives Swift one stable pointer to retain; the caller that built it owns it and must release it with free_chat_params.
+struct llm_chat_params {
+    common_chat_params        chat_params;
+    common_chat_parser_params parser_params;
+};
+
+// Build a llm_chat_params from the model and a probing message set.
+// Returns nullptr if the template cannot be applied.
+llm_chat_params* create_chat_params(const struct llama_model* model, common_chat_templates_inputs* inputs);
+void free_chat_params(llm_chat_params* params);
+common_chat_format get_chat_params_format(const llm_chat_params* params);
+
+// Parse a response using the pre-built parser params. Returns a message whose
+// tool_calls vector is populated when the response matches the grammar.
+common_chat_msg parse_chat_response(const llm_chat_params* params, const char* response, bool is_partial);
diff --git a/Sources/LocalLLMClientLlamaC/minja b/Sources/LocalLLMClientLlamaC/minja
deleted file mode 120000
index 03ae71a..0000000
--- a/Sources/LocalLLMClientLlamaC/minja
+++ /dev/null
@@ -1 +0,0 @@
-exclude/llama.cpp/vendor/minja
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/models b/Sources/LocalLLMClientLlamaC/models
new file mode 120000
index 0000000..aaaed15
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/models
@@ -0,0 +1 @@
+exclude/llama.cpp/tools/mtmd/models
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/mtmd-image.cpp b/Sources/LocalLLMClientLlamaC/mtmd-image.cpp
new file mode 120000
index 0000000..6c0c387
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/mtmd-image.cpp
@@ -0,0 +1 @@
+exclude/llama.cpp/tools/mtmd/mtmd-image.cpp
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/mtmd-image.h b/Sources/LocalLLMClientLlamaC/mtmd-image.h
new file mode 120000
index 0000000..5db91cd
--- /dev/null
+++ b/Sources/LocalLLMClientLlamaC/mtmd-image.h
@@ -0,0 +1 @@
+exclude/llama.cpp/tools/mtmd/mtmd-image.h
\ No newline at end of file
diff --git a/Sources/LocalLLMClientLlamaC/utils.cpp b/Sources/LocalLLMClientLlamaC/utils.cpp
index 0e0c108..9fb4923 100644
--- a/Sources/LocalLLMClientLlamaC/utils.cpp
+++ b/Sources/LocalLLMClientLlamaC/utils.cpp
@@ -1,5 +1,7 @@
 #include "include/utils.h"
 
+#include <cstdio>
+
 template<typename T, typename Deleter>
 void* get_raw_pointer_from_unique_ptr(const std::unique_ptr<T, Deleter>& ptr) {
     return static_cast<void*>(ptr.get());
@@ -26,6 +28,16 @@ void add_message_to_inputs(common_chat_templates_inputs* inputs, const char* rol
     }
 }
 
+// Appends one tool definition to `inputs`; does nothing when `inputs` or `name` is null.
+void add_tool_to_inputs(common_chat_templates_inputs* inputs, const char* name, const char* description, const char* parameters_json) {
+    if (!inputs || !name) return;
+    common_chat_tool entry;
+    entry.name        = std::string(name);
+    entry.description = description ? std::string(description) : std::string(); // null description becomes an empty string
+    entry.parameters  = parameters_json ? std::string(parameters_json) : std::string("{}"); // null schema becomes an empty JSON object
+    inputs->tools.push_back(std::move(entry));
+}
+
 common_chat_params apply_chat_templates_safe(const common_chat_templates* tmpls, common_chat_templates_inputs* inputs) {
     if (tmpls && inputs) {
         return common_chat_templates_apply(tmpls, *inputs);
@@ -46,3 +58,43 @@ common_chat_params apply_chat_templates_with_model(const struct llama_model* mod
 void free_chat_templates_inputs(common_chat_templates_inputs* inputs) {
     delete inputs;
 }
+
+// Builds the params on the heap; yields nullptr for null arguments, missing templates, or a std::exception raised while applying them.
+llm_chat_params* create_chat_params(const struct llama_model* model, common_chat_templates_inputs* inputs) {
+    if (!model || !inputs) return nullptr;
+    try {
+        auto templates = common_chat_templates_init(model, "", "", "");
+        if (!templates) return nullptr;
+        std::unique_ptr<llm_chat_params> out(new llm_chat_params{}); // released automatically if a later step throws
+        out->chat_params = common_chat_templates_apply(templates.get(), *inputs);
+        out->parser_params = common_chat_parser_params(out->chat_params);
+        if (!out->chat_params.parser.empty()) out->parser_params.parser.load(out->chat_params.parser);
+        return out.release(); // ownership passes to the caller, who must call free_chat_params
+    } catch (const std::exception & e) { // a C++ exception must not unwind into the Swift caller
+        fprintf(stderr, "[LocalLLMClient] create_chat_params: %s\n", e.what());
+        return nullptr;
+    }
+}
+
+void free_chat_params(llm_chat_params* params) {
+    delete params; // deleting a null pointer is a no-op, so no null check is needed
+}
+
+common_chat_format get_chat_params_format(const llm_chat_params* params) {
+    return params ? params->chat_params.format : COMMON_CHAT_FORMAT_CONTENT_ONLY; // null params report the content-only format
+}
+
+// Runs the pre-built parser over `response`; yields an empty message for null arguments or when the parser throws.
+common_chat_msg parse_chat_response(const llm_chat_params* params, const char* response, bool is_partial) {
+    if (params == nullptr || response == nullptr) return {};
+    try {
+        const std::string text(response);
+        return common_chat_parse(text, is_partial, params->parser_params);
+    } catch (const std::exception & error) {
+        // The PEG parser throws on grammar-mismatched input. That is reported as
+        // "no tool calls", but the reason still goes to stderr so real failures
+        // (allocation failure, invariant violations, ...) can be diagnosed.
+        fprintf(stderr, "[LocalLLMClient] parse_chat_response: %s\n", error.what());
+        return {};
+    }
+}
diff --git a/Sources/LocalLLMClientMLX/Context.swift b/Sources/LocalLLMClientMLX/Context.swift
index 0a3ae75..30c2308 100644
--- a/Sources/LocalLLMClientMLX/Context.swift
+++ b/Sources/LocalLLMClientMLX/Context.swift
@@ -4,6 +4,7 @@ import LocalLLMClientCore
 import MLX
 import MLXLLM
 import MLXLMCommon
+import MLXHuggingFace
 import Tokenizers
 
 public final class Context: Sendable {
@@ -36,7 +37,7 @@ public final class Context: Sendable {
 
     private static func loadModel(
         url: URL, configuration: ModelConfiguration
-    ) async throws(LLMError) -> (any LanguageModel, any Tokenizer) {
+    ) async throws(LLMError) -> (any LanguageModel, any MLXLMCommon.Tokenizer) {
         do {
             let configurationURL = url.appending(component: "config.json")
             let configurationData = try Data(contentsOf: configurationURL)
@@ -58,7 +59,8 @@ public final class Context: Sendable {
 
             try loadWeights(modelDirectory: url, model: model, perLayerQuantization: baseConfiguration.perLayerQuantization)
 
-            let tokenizer = try await loadTokenizer(configuration: configuration, hub: .shared)
+            let tokenizerLoader: any MLXLMCommon.TokenizerLoader = #huggingFaceTokenizerLoader()
+            let tokenizer = try await tokenizerLoader.load(from: url)
             return (model, tokenizer)
         } catch {
             throw .failedToLoad(reason: error.localizedDescription)
@@ -66,13 +68,15 @@ public final class Context: Sendable {
     }
 
     private static func makeProcessor(
-        url: URL, configuration: ModelConfiguration, tokenizer: any Tokenizer,
+        url: URL, configuration: ModelConfiguration, tokenizer: any MLXLMCommon.Tokenizer,
     ) async -> (any UserInputProcessor, Bool) {
         do {
-            let processorConfiguration = url.appending(
-                component: "preprocessor_config.json"
-            )
-            let configurationData = try Data(contentsOf: processorConfiguration)
+            let preprocessorURL = url.appending(component: "preprocessor_config.json")
+            let processorURL = url.appending(component: "processor_config.json")
+            let configURL = FileManager.default.fileExists(atPath: preprocessorURL.path)
+                ? preprocessorURL
+                : processorURL
+            let configurationData = try Data(contentsOf: configURL)
             let baseProcessorConfig = try JSONDecoder().decode(
                 BaseProcessorConfiguration.self,
                 from: configurationData
@@ -94,12 +98,12 @@ public final class Context: Sendable {
 }
 
 private struct LLMUserInputProcessor: UserInputProcessor {
-    let tokenizer: Tokenizer
+    let tokenizer: MLXLMCommon.Tokenizer
     let configuration: ModelConfiguration
     let messageGenerator: MessageGenerator
 
     init(
-        tokenizer: any Tokenizer, configuration: ModelConfiguration,
+        tokenizer: any MLXLMCommon.Tokenizer, configuration: ModelConfiguration,
         messageGenerator: MessageGenerator
     ) {
         self.tokenizer = tokenizer
diff --git a/Sources/LocalLLMClientMLX/Utility.swift b/Sources/LocalLLMClientMLX/Utility.swift
index e202c97..9b835c4 100644
--- a/Sources/LocalLLMClientMLX/Utility.swift
+++ b/Sources/LocalLLMClientMLX/Utility.swift
@@ -9,5 +9,5 @@ nonisolated(unsafe) private var isMLXInitialized = false
 public func initializeMLX() {
     guard !isMLXInitialized else { return }
     isMLXInitialized = true
-    MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)
+    MLX.Memory.cacheLimit = 20 * 1024 * 1024 // 20 * 1024 * 1024 = 20 MiB, assuming the limit is expressed in bytes — TODO confirm
 }
diff --git a/Sources/LocalLLMClientUtility/Globs.swift b/Sources/LocalLLMClientUtility/Globs.swift
index 08b4e2f..3fb86f1 100644
--- a/Sources/LocalLLMClientUtility/Globs.swift
+++ b/Sources/LocalLLMClientUtility/Globs.swift
@@ -9,8 +9,10 @@ public struct Globs: Sendable, Equatable {
         self.rawValue = globs
     }
 
-    /// Default glob patterns for MLX models, typically including "*.safetensors" and "*.json".
-    public static let mlx = Globs(["*.safetensors", "*.json"])
+    /// Default glob patterns for MLX models, covering weights, config JSON and
+    /// the chat template (newer models such as Gemma 4 ship it as a separate
+    /// `chat_template.jinja` file instead of inlining it in `tokenizer_config.json`).
+    public static let mlx = Globs(["*.safetensors", "*.json", "*.jinja"])
 
     /// Appends a new glob pattern to the set.
     /// - Parameter glob: A string representing a glob pattern to be added.
diff --git a/Tests/LocalLLMClientLlamaTests/LlamaToolCallParserTests.swift b/Tests/LocalLLMClientLlamaTests/LlamaToolCallParserTests.swift
deleted file mode 100644
index 2980f86..0000000
--- a/Tests/LocalLLMClientLlamaTests/LlamaToolCallParserTests.swift
+++ /dev/null
@@ -1,146 +0,0 @@
-import Testing
-import Foundation
-import LocalLLMClientCore
-@testable import LocalLLMClientLlama
-
-@Suite
-struct LlamaToolCallParserTests {
-
-    @Test
-    func parseEmptyResponse() async throws {
-        let result = LlamaToolCallParser.parseToolCalls(from: "", format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result == nil)
-    }
-    
-    @Test
-    func parseNoToolCalls() async throws {
-        let response = "This is just a regular response without any tool calls."
-        let result = LlamaToolCallParser.parseToolCalls(from: response, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result == nil)
-    }
-    
-    @Test
-    func parseGenericJSONToolCall() async throws {
-        // Generic format expects {"tool_call": {"name": "...", "arguments": "..."}}
-        let response = """
-        I'll help you with that. Let me call a function to get the weather.
-        
-        {"tool_call": {"name": "get_weather", "arguments": {"location": "New York", "unit": "celsius"}}}
-        
-        Here's the weather information.
-        """
-        
-        let result = LlamaToolCallParser.parseToolCalls(from: response, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result != nil)
-        #expect(result?.count == 1)
-        
-        if let toolCall = result?.first {
-            #expect(toolCall.name == "get_weather")
-            #expect(toolCall.arguments.contains("New York"))
-            #expect(toolCall.arguments.contains("celsius"))
-        }
-    }
-    
-    @Test
-    func parseHermesFormatToolCall() async throws {
-        let response = """
-        I'll search for that information.
-        
-        <tool_call>
-        {"name": "search", "arguments": {"query": "Swift programming"}}
-        </tool_call>
-        
-        Found some results for you.
-        """
-        
-        let result = LlamaToolCallParser.parseToolCalls(from: response, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result != nil)
-        #expect(result?.count == 1)
-        
-        if let toolCall = result?.first {
-            #expect(toolCall.name == "search")
-            #expect(toolCall.arguments.contains("Swift programming"))
-        }
-    }
-    
-    @Test
-    func parseMultipleToolCalls() async throws {
-        // Currently the parser only captures the first tool call when multiple are present
-        // This is a limitation of the current llama.cpp integration
-        let response = """
-        I'll need to call several functions to help you.
-        
-        <tool_call>
-        {"name": "function1", "arguments": {"param": "value1"}}
-        </tool_call>
-        
-        <tool_call>
-        {"name": "function2", "arguments": {"param": "value2"}}
-        </tool_call>
-        
-        Both functions have been called.
-        """
-        
-        let result = LlamaToolCallParser.parseToolCalls(from: response, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result != nil, "Parser should return a result for valid tool calls")
-        #expect(result?.count == 2, "Currently only first tool call is parsed (known limitation)")
-        
-        if let toolCall = result?.first {
-            #expect(toolCall.name == "function1", "First tool call should be function1")
-            #expect(toolCall.arguments.contains("value1"), "Arguments should contain value1")
-            
-            // Verify the arguments can be parsed as JSON
-            let data = toolCall.arguments.data(using: .utf8)!
-            let parsed = try JSONSerialization.jsonObject(with: data) as? [String: Any]
-            #expect(parsed?["param"] as? String == "value1", "Parsed arguments should have correct value")
-        } else {
-            Issue.record("Expected at least 1 tool call but got \(result?.count ?? 0)")
-        }
-    }
-    
-    @Test
-    func parseToolCallWithID() async throws {
-        let response = """
-        <tool_call>
-        {"name": "my_function", "id": "call_123", "arguments": {"test": "data"}}
-        </tool_call>
-        """
-        
-        let result = LlamaToolCallParser.parseToolCalls(from: response, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result != nil)
-        #expect(result?.count == 1)
-        
-        if let toolCall = result?.first {
-            #expect(toolCall.id == "call_123")
-            #expect(toolCall.name == "my_function")
-            #expect(toolCall.arguments.contains("test"))
-        }
-    }
-    
-    @Test
-    func parseToolCallWithoutID() async throws {
-        let response = """
-        <tool_call>
-        {"name": "my_function", "arguments": {"test": "data"}}
-        </tool_call>
-        """
-        
-        let result = LlamaToolCallParser.parseToolCalls(from: response, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        #expect(result != nil, "Parser should handle tool calls without ID")
-        #expect(result?.count == 1, "Should parse exactly one tool call")
-        
-        if let toolCall = result?.first {
-            // Should generate a UUID when no ID is provided
-            #expect(!toolCall.id.isEmpty, "Auto-generated ID should not be empty")
-            #expect(toolCall.id.count >= 36, "Auto-generated ID should be UUID-like") // UUID format
-            #expect(toolCall.name == "my_function", "Tool name should match")
-            #expect(toolCall.arguments.contains("test"), "Arguments should contain test data")
-            
-            // Verify UUID format (basic check)
-            let uuidPattern = "^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
-            let uuidRegex = try? NSRegularExpression(pattern: uuidPattern)
-            let matches = uuidRegex?.matches(in: toolCall.id, range: NSRange(location: 0, length: toolCall.id.count))
-            #expect((matches?.count ?? 0) > 0 || toolCall.id.count >= 8, "ID should be in valid format")
-        }
-    }
-}
diff --git a/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift b/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift
index edb4f3f..3b48856 100644
--- a/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift
+++ b/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift
@@ -17,416 +17,114 @@ extension ModelTests {
 }
 
 extension ModelTests.LocalLLMClientLlamaToolTests {
-    // Note: LlamaClient tool calling tests are focused on Llama-specific features
-    // Full integration tests would require model download which is skipped in CI
-    
-    @Test
-    func llamaSpecificChatFormatSupport() async throws {
-        // Test Llama-specific chat formats that support tool calling
-        let toolSupportingFormats = [
-            (COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, "llama3_builtin"),
-            (COMMON_CHAT_FORMAT_FIREFUNCTION_V2, "firefunction_v2"),
-            (COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, "functionary_v3.2"),
-            (COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, "functionary_v3.1_llama3.1"),
-            (COMMON_CHAT_FORMAT_HERMES_2_PRO, "hermes_2_pro")
-        ]
-        
-        for (format, name) in toolSupportingFormats {
-            // Each format has different tool call syntax
-            let testResponse = "<tool_call>{\"name\": \"test_tool\", \"arguments\": {}}</tool_call>"
-            let calls = LlamaToolCallParser.parseToolCalls(from: testResponse, format: format)
-            
-            // Some formats may not support this syntax
-            if calls != nil && !calls!.isEmpty {
-                #expect(calls?.first?.name == "test_tool", "Format \(name) should parse tool calls")
-            }
-        }
-    }
-    
-    @Test
-    func llamaSpecificToolParsing() async throws {
-        // Test Llama-specific tool parsing logic
-        // Test parsing tool calls from Llama format
-        let llamaResponse = """
-        I'll help you with that. <tool_call>
-        {"name": "get_weather", "arguments": {"location": "Tokyo"}}
-        </tool_call>
-        """
-        
-        // Try different formats that support tool calling
-        var toolCalls: [LLMToolCall]? = nil
-        
-        // Try formats that might support tool calling
-        let formatsToTry = [
-            COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-            COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1
-        ]
-        
-        for format in formatsToTry {
-            toolCalls = LlamaToolCallParser.parseToolCalls(from: llamaResponse, format: format)
-            if toolCalls != nil && !toolCalls!.isEmpty {
-                break
-            }
-        }
-        
-        // If no format worked, skip the test
-        if toolCalls == nil || toolCalls!.isEmpty {
-            // This test requires a specific chat format that supports tool calling
-            // Skip if the format is not available
-            return
-        }
-        #expect(toolCalls?.count == 1)
-        #expect(toolCalls?.first?.name == "get_weather")
+    private func makeToolClient() async throws -> LlamaClient {
+        try await LocalLLMClient.llama(
+            testType: .tool,
+            // Qwen2.5 uses <tool_call> tags; tools must be declared so the PEG parser
+            // includes the tool-call grammar branches.
+            // These tools come from LocalLLMClientTestUtilities (WeatherTool, CalculatorTool).
+        )
     }
-    
-    @Test
-    func llamaChunkedToolCallParsing() async throws {
-        // Test parsing chunked tool calls
-        // Test parsing chunked tool calls
-        let chunks = [
-            "I'll check the weather. <tool_",
-            "call>\n{\"name\": \"get",
-            "_weather\", \"arguments\": {",
-            "\"location\": \"Tokyo\"}}",
-            "\n</tool_call>"
-        ]
-        
-        var fullResponse = ""
-        var parsedCalls: [LLMToolCall] = []
-        
-        // Try formats that support tool calling
-        let formatsToTry = [
-            COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-            COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1
-        ]
-        
-        for chunk in chunks {
-            fullResponse += chunk
-            
-            // Try different formats
-            for format in formatsToTry {
-                parsedCalls = LlamaToolCallParser.parseToolCalls(from: fullResponse, format: format) ?? []
-                if !parsedCalls.isEmpty {
-                    break
-                }
-            }
-        }
-        
-        // If no format worked, skip the test
-        if parsedCalls.isEmpty {
-            // This test requires a specific chat format that supports tool calling
-            // Skip if the format is not available
-            return
-        }
-        
-        #expect(parsedCalls.count == 1)
-        #expect(parsedCalls.first?.name == "get_weather")
-    }
-    
-    @Test
-    func llamaModelCapabilityCheck() async throws {
-        // Test model capability detection based on model name patterns
-        let toolSupportingModels = [
-            "qwen2.5-1.5b-instruct-q5_k_m.gguf",
-            "hermes-2-pro",
-            "functionary",
-            "firefunction"
-        ]
-        
-        let nonToolModels = [
-            "llama-2-7b.gguf",
-            "mistral-7b.gguf"
-        ]
-        
-        // Check tool-supporting models
-        for modelName in toolSupportingModels {
-            let supportsTools = modelName.contains("qwen2") || 
-                               modelName.contains("hermes") || 
-                               modelName.contains("functionary") || 
-                               modelName.contains("firefunction")
-            #expect(supportsTools == true)
-        }
-        
-        // Check non-tool models
-        for modelName in nonToolModels {
-            let supportsTools = modelName.contains("qwen2") || 
-                               modelName.contains("hermes") || 
-                               modelName.contains("functionary") || 
-                               modelName.contains("firefunction")
-            #expect(supportsTools == false)
-        }
+
+    private func buildChatParams(tools: [any LLMTool]) async throws -> (LlamaClient, UnsafeMutablePointer<llm_chat_params>?) {
+        let client = try await LocalLLMClient.llama(tools: tools, testType: .tool)
+        let wrapped = tools.map { AnyLLMTool($0) }
+        let params = client._context.model.buildChatParams(tools: wrapped)
+        return (client, params)
     }
-    
+
     @Test
-    func llamaToolCallFormat() async throws {
-        // Test multiple tool calls
-        let multiToolResponse = """
-        I'll help you with both tasks.
+    func parseToolCallFromHermesStyleResponse() async throws {
+        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool()])
+        defer { if let chatParams { free_chat_params(chatParams) } }
+
+        let response = """
         <tool_call>
         {"name": "get_weather", "arguments": {"location": "Tokyo", "unit": "celsius"}}
         </tool_call>
-        <tool_call>
-        {"name": "calculator", "arguments": {"expression": "2 + 2"}}
-        </tool_call>
         """
-        
-        // Try formats that support tool calling
-        let formatsToTry = [
-            COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-            COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1
-        ]
-        
-        var toolCalls: [LLMToolCall] = []
-        for format in formatsToTry {
-            toolCalls = LlamaToolCallParser.parseToolCalls(from: multiToolResponse, format: format) ?? []
-            if !toolCalls.isEmpty {
-                break
-            }
-        }
-        
-        // If no format worked, skip the test
-        if toolCalls.isEmpty {
-            return
-        }
-        
-        #expect(toolCalls.count == 2)
-        #expect(toolCalls[0].name == "get_weather")
-        #expect(toolCalls[1].name == "calculator")
+
+        let calls = LlamaToolCallParser.parseToolCalls(from: response, chatParams: chatParams)
+        try #require(calls != nil, "Expected tool calls to be extracted from a well-formed response")
+        #expect(calls?.count == 1)
+        #expect(calls?.first?.name == "get_weather")
+        #expect(calls?.first?.arguments.contains("Tokyo") == true)
     }
-    
+
     @Test
-    func llamaInvalidToolCallParsing() async throws {
-        // Test invalid JSON in tool call
-        let invalidResponse = """
+    func parseToolCallWithCalculatorTool() async throws {
+        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool(), CalculatorTool()])
+        defer { if let chatParams { free_chat_params(chatParams) } }
+
+        let response = """
         <tool_call>
-        {"name": "test", invalid json here}
+        {"name": "calculate", "arguments": {"expression": "15 * 4"}}
         </tool_call>
         """
-        
-        // For invalid JSON, all formats should return empty
-        let formatsToTry = [
-            COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-            COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
-            COMMON_CHAT_FORMAT_GENERIC
-        ]
-        
-        for format in formatsToTry {
-            let toolCalls = LlamaToolCallParser.parseToolCalls(from: invalidResponse, format: format) ?? []
-            #expect(toolCalls.isEmpty) // Should gracefully handle invalid JSON
-        }
+
+        let calls = LlamaToolCallParser.parseToolCalls(from: response, chatParams: chatParams)
+        try #require(calls != nil)
+        #expect(calls?.first?.name == "calculate")
+        #expect(calls?.first?.arguments.contains("15 * 4") == true)
+    }
+
+    @Test
+    func parseResponseWithoutToolCalls() async throws {
+        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool()])
+        defer { if let chatParams { free_chat_params(chatParams) } }
+
+        let calls = LlamaToolCallParser.parseToolCalls(
+            from: "This is a plain response without any tool calls.",
+            chatParams: chatParams
+        )
+        #expect(calls == nil)
     }
-    
+
     @Test
-    func llamaToolArgumentExtraction() async throws {
-        let response = """
-        <tool_call>
-        {
-            "name": "complex_tool",
-            "arguments": {
-                "string_arg": "test",
-                "number_arg": 42,
-                "bool_arg": true,
-                "array_arg": ["a", "b", "c"],
-                "nested_arg": {
-                    "field1": "value1",
-                    "field2": 123
-                }
-            }
-        }
-        </tool_call>
-        """
-        
-        // Try formats that support tool calling
-        let formatsToTry = [
-            COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-            COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1
-        ]
-        
-        var toolCalls: [LLMToolCall] = []
-        for format in formatsToTry {
-            toolCalls = LlamaToolCallParser.parseToolCalls(from: response, format: format) ?? []
-            if !toolCalls.isEmpty {
-                break
-            }
-        }
-        
-        // If no format worked, skip the test
-        if toolCalls.isEmpty {
-            return
-        }
-        
-        #expect(toolCalls.count == 1)
-        
-        let call = toolCalls[0]
-        #expect(call.name == "complex_tool")
-        
-        // Verify JSON string contains all arguments
-        #expect(call.arguments.contains("string_arg"))
-        #expect(call.arguments.contains("number_arg"))
-        #expect(call.arguments.contains("bool_arg"))
-        #expect(call.arguments.contains("array_arg"))
-        #expect(call.arguments.contains("nested_arg"))
+    func parseEmptyResponse() async throws {
+        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool()])
+        defer { if let chatParams { free_chat_params(chatParams) } }
+
+        let calls = LlamaToolCallParser.parseToolCalls(from: "", chatParams: chatParams)
+        #expect(calls == nil)
     }
-    
+
     @Test
-    func llamaToolResponseCleaning() async throws {
-        // Test that parser extracts clean text without tool calls
-        let responseWithTools = """
-        Here's the weather information:
+    func parseToolCallAssignsIDWhenMissing() async throws {
+        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool()])
+        defer { if let chatParams { free_chat_params(chatParams) } }
+
+        let response = """
         <tool_call>
-        {"name": "get_weather", "arguments": {"location": "Tokyo"}}
+        {"name": "get_weather", "arguments": {"location": "Tokyo", "unit": "celsius"}}
         </tool_call>
-        The weather in Tokyo is sunny.
         """
-        
-        // Try formats that support tool calling
-        let formatsToTry = [
-            COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-            COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
-            COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1
-        ]
-        
-        var toolCalls: [LLMToolCall] = []
-        for format in formatsToTry {
-            toolCalls = LlamaToolCallParser.parseToolCalls(from: responseWithTools, format: format) ?? []
-            if !toolCalls.isEmpty {
-                break
-            }
-        }
-        
-        // If no format worked, skip the test
-        if toolCalls.isEmpty {
-            return
-        }
-        
-        #expect(toolCalls.count == 1)
-        
-        // The parser should extract tool calls but not modify the original text
-        // That's handled by other components
+
+        let calls = LlamaToolCallParser.parseToolCalls(from: response, chatParams: chatParams)
+        try #require(calls?.first != nil)
+        #expect(!calls!.first!.id.isEmpty, "An auto-generated UUID should be assigned when the model omits an id")
     }
-    
+
     @Test
-    func llamaStreamingToolCallParsing() async throws {
-        // Test parsing tool calls from streaming responses
-        actor StreamingParser {
-            private var buffer = ""
-            private var lastParsedCalls: [LLMToolCall] = []
-            
-            func appendChunk(_ chunk: String) -> [LLMToolCall]? {
-                buffer += chunk
-                
-                // Try to parse with different formats
-                let formats = [
-                    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
-                    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
-                    COMMON_CHAT_FORMAT_HERMES_2_PRO
-                ]
-                
-                for format in formats {
-                    if let calls = LlamaToolCallParser.parseToolCalls(from: buffer, format: format),
-                       !calls.isEmpty {
-                        // Check if we have new complete tool calls
-                        if calls.count > lastParsedCalls.count {
-                            lastParsedCalls = calls
-                            return calls
-                        }
-                    }
-                }
-                
-                return nil
-            }
-        }
-        
-        let parser = StreamingParser()
-        
-        // Simulate streaming chunks
-        let chunks = [
-            "I'll help you check the weather. ",
-            "<tool_call>\n{",
-            "\"name\": \"get_weather\",",
-            " \"arguments\": {",
-            "\"location\": \"Paris\",",
-            " \"unit\": \"celsius\"",
-            "}}\n</tool_call>"
-        ]
-        
-        var foundToolCall = false
-        for chunk in chunks {
-            if let calls = await parser.appendChunk(chunk), !calls.isEmpty {
-                foundToolCall = true
-                #expect(calls.first?.name == "get_weather")
-                #expect(calls.first?.arguments.contains("Paris") == true)
-                break
-            }
-        }
-        
-        #expect(foundToolCall, "Should have found tool call in streaming chunks")
+    func parseToolCallsReturnsNilWhenChatParamsIsNil() {
+        // Guard clause: a missing chatParams pointer must not crash and must return nil.
+        let calls = LlamaToolCallParser.parseToolCalls(
+            from: "<tool_call>{\"name\":\"x\",\"arguments\":{}}</tool_call>",
+            chatParams: nil
+        )
+        #expect(calls == nil)
     }
-    
+
     @Test
-    func llamaMultipleSequentialToolCalls() async throws {
-        // Test handling multiple tool calls in sequence
-        // Note: Current llama.cpp implementation only supports parsing the first tool call
-        
-        // Test with HERMES format which is known to support <tool_call> tags
-        let hermesResponse = """
-        Let me help you with multiple tasks.
-        
-        <tool_call>
-        {"name": "get_weather", "arguments": {"location": "London"}}
-        </tool_call>
-        
-        <tool_call>
-        {"name": "calculate", "arguments": {"expression": "15 * 4"}}
-        </tool_call>
-        
-        <tool_call>
-        {"name": "search", "arguments": {"query": "Swift programming"}}
-        </tool_call>
-        """
-        
-        // Parse with HERMES format
-        let hermesCalls = LlamaToolCallParser.parseToolCalls(from: hermesResponse, format: COMMON_CHAT_FORMAT_HERMES_2_PRO)
-        
-        // Verify at least one tool call is parsed (current limitation: only first is parsed)
-        #expect(hermesCalls != nil, "HERMES format should return parsed tool calls")
-        #expect(hermesCalls?.count ?? 0 >= 1, "Should parse at least one tool call")
-        
-        if let firstCall = hermesCalls?.first {
-            #expect(firstCall.name == "get_weather", "First tool call should be get_weather")
-            #expect(firstCall.arguments.contains("London"), "Arguments should contain London")
-        }
-        
-        // Test with generic JSON format for comparison
-        let genericResponse = """
-        I'll help you with that.
-        
-        {"tool_call": {"name": "get_weather", "arguments": {"location": "Paris", "unit": "celsius"}}}
-        """
-        
-        // Note: Generic format expects different structure
-        let genericCalls = LlamaToolCallParser.parseToolCalls(from: genericResponse, format: COMMON_CHAT_FORMAT_GENERIC)
-        
-        // Count successful formats
-        var successfulFormats = 0
-        if (hermesCalls?.count ?? 0) > 0 { successfulFormats += 1 }
-        if (genericCalls?.count ?? 0) > 0 { successfulFormats += 1 }
-        
-        // At least one format should work
-        #expect(successfulFormats >= 1, "At least one format should successfully parse tool calls")
-        
-        // Document current limitation
-        print("Note: Current implementation only parses the first tool call. Found \(hermesCalls?.count ?? 0) tool calls with HERMES format.")
+    func chatFormatIsReportedForToolClient() async throws {
+        let client = try await LocalLLMClient.llama(tools: [WeatherTool()], testType: .tool)
+        let format = client.chatFormat
+        let validFormats: [common_chat_format] = [
+            COMMON_CHAT_FORMAT_CONTENT_ONLY,
+            COMMON_CHAT_FORMAT_PEG_SIMPLE,
+            COMMON_CHAT_FORMAT_PEG_NATIVE,
+            COMMON_CHAT_FORMAT_PEG_GEMMA4,
+        ]
+        #expect(validFormats.contains(format), "Unexpected chat format: \(format)")
     }
 }
diff --git a/Tests/LocalLLMClientLlamaTests/MessageProcessorTests.swift b/Tests/LocalLLMClientLlamaTests/MessageProcessorTests.swift
index ab046c4..ff4033a 100644
--- a/Tests/LocalLLMClientLlamaTests/MessageProcessorTests.swift
+++ b/Tests/LocalLLMClientLlamaTests/MessageProcessorTests.swift
@@ -96,6 +96,22 @@ struct MessageProcessorTests {
         #expect(chunks == [.text("[INST] \(userMarker) [/INST]\(assistantMarker)eos_token")])
     }
 
+    @Test
+    func gemma_4() async throws {
+        // Gemma 4 uses <|turn>...<turn|> framing with <|image|> for multimodal content.
+        // Real template: https://huggingface.co/google/gemma-4-E4B-it/raw/main/chat_template.jinja (≈16 KB).
+        // Use a minimal variant here so the test does not depend on that specific file.
+        let template = #"{% for message in messages %}<|turn>{{ message['role'] }} {%- if message['content'] is string -%}{{ message['content'] }}{%- else -%}{%- for item in message['content'] -%}{%- if item['type'] == 'image' -%}<|image|>{%- elif item['type'] == 'text' -%}{{ item['text'] }}{%- endif -%}{%- endfor -%}{%- endif -%}<turn|>{% endfor %}{% if add_generation_prompt %}<|turn>model {% endif %}"#
+        let autoProcessor = MessageProcessorFactory.createAutoProcessor(chatTemplate: template)
+        let (rendered, chunks) = try validate(processor: autoProcessor, chatTemplate: template)
+        #expect(rendered.contains("<|turn>") && rendered.contains("<turn|>"))
+        // Image content must be split out as a dedicated `.image` chunk via the `<|image|>` pattern.
+        let containsImageChunk = chunks.contains { chunk in
+            if case .image = chunk { return true } else { return false }
+        }
+        #expect(containsImageChunk, "Gemma 4 auto-detection should pick the processor whose chunk extractor matches <|image|>")
+    }
+
     @Test
     func autoDetection() async throws {
         // Test that auto-detection works correctly for different templates
diff --git a/Tests/LocalLLMClientLlamaTests/ModelTests.swift b/Tests/LocalLLMClientLlamaTests/ModelTests.swift
index a3f6f10..20552cd 100644
--- a/Tests/LocalLLMClientLlamaTests/ModelTests.swift
+++ b/Tests/LocalLLMClientLlamaTests/ModelTests.swift
@@ -53,9 +53,9 @@ extension LocalLLMClient {
                 )
             case .normal:
                 return (
-                    id: "lmstudio-community/gemma-3-4B-it-qat-GGUF",
-                    model: "gemma-3-4B-it-QAT-Q4_0.gguf",
-                    clip: "mmproj-model-f16.gguf"
+                    id: "lmstudio-community/gemma-4-E4B-it-GGUF",
+                    model: "gemma-4-E4B-it-Q4_K_M.gguf",
+                    clip: "mmproj-gemma-4-E4B-it-BF16.gguf"
                 )
             }
         }
diff --git a/Tests/LocalLLMClientUtilityTests/FilesMetadataTests.swift b/Tests/LocalLLMClientUtilityTests/FilesMetadataTests.swift
index f4bbdff..92f3452 100644
--- a/Tests/LocalLLMClientUtilityTests/FilesMetadataTests.swift
+++ b/Tests/LocalLLMClientUtilityTests/FilesMetadataTests.swift
@@ -39,8 +39,9 @@ struct FilesMetadataTests {
     func testHuggingFaceGlobsMLXDefault() {
         let mlxGlobs = Globs.mlx
 
-        #expect(mlxGlobs.rawValue.count == 2)
+        #expect(mlxGlobs.rawValue.count == 3)
         #expect(mlxGlobs.rawValue.contains("*.safetensors"))
         #expect(mlxGlobs.rawValue.contains("*.json"))
+        #expect(mlxGlobs.rawValue.contains("*.jinja"))
     }
 }
diff --git a/scripts/update_dependencies.sh b/scripts/update_dependencies.sh
index dfca73d..2330dfd 100755
--- a/scripts/update_dependencies.sh
+++ b/scripts/update_dependencies.sh
@@ -66,4 +66,29 @@ echo "Package.swift has been updated to use llama.cpp version $TARGET_TAG"
 echo "Updating git submodules..."
 git fetch --tags
 git -C "$PROJECT_ROOT/Sources/LocalLLMClientLlamaC/exclude/llama.cpp" checkout tags/$TARGET_TAG
-echo "All submodules have been updated."
\ No newline at end of file
+echo "All submodules have been updated."
+
+# Verify that every symlink in LocalLLMClientLlamaC still resolves against the new submodule.
+echo
+echo "Verifying symlinks under LocalLLMClientLlamaC/..."
+LLAMAC_DIR="$PROJECT_ROOT/Sources/LocalLLMClientLlamaC"
+BROKEN_SYMLINKS_FILE="$(mktemp)"
+find "$LLAMAC_DIR" -path "$LLAMAC_DIR/exclude" -prune -o -type l -print | while IFS= read -r link; do # IFS= keeps leading/trailing whitespace in paths
+    if [ ! -e "$link" ]; then
+        target=$(readlink "$link")
+        printf '  %s -> %s\n' "${link#"$PROJECT_ROOT/"}" "$target" >> "$BROKEN_SYMLINKS_FILE"
+    fi
+done
+
+if [ -s "$BROKEN_SYMLINKS_FILE" ]; then
+    echo "WARNING: the following symlinks no longer resolve after updating to $TARGET_TAG:"
+    cat "$BROKEN_SYMLINKS_FILE"
+    rm -f "$BROKEN_SYMLINKS_FILE"
+    echo
+    echo "Upstream probably renamed or moved these files. Inspect the new tree under"
+    echo "  Sources/LocalLLMClientLlamaC/exclude/llama.cpp/"
+    echo "and update the symlinks (and any compile/header references) before committing."
+    exit 1
+fi
+rm -f "$BROKEN_SYMLINKS_FILE"
+echo "All symlinks resolve."
\ No newline at end of file

From 4f16951736259a5d0f19ea4f17d8942de19572ed Mon Sep 17 00:00:00 2001
From: "tattn (Tatsuya Tanaka)" <contact@tattn.dev>
Date: Mon, 20 Apr 2026 20:33:01 +0900
Subject: [PATCH 2/3] Fix CI

---
 .github/workflows/test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 2e11da1..ad9e37e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -58,7 +58,7 @@ jobs:
           xcodebuild -downloadComponent MetalToolchain
 
       - name: Run ${{ matrix.test-type }} tests with Xcode 26
-        run: TEST_RUNNER_GITHUB_ACTIONS_TEST="${{ matrix.test-type }}" xcodebuild test -scheme LocalLLMClient-Package -destination 'platform=macOS'
+        run: TEST_RUNNER_GITHUB_ACTIONS_TEST="${{ matrix.test-type }}" xcodebuild test -skipMacroValidation -scheme LocalLLMClient-Package -destination 'platform=macOS'
 
       - name: Upload test results
         if: failure()
@@ -84,7 +84,7 @@ jobs:
         working-directory: Example
         run: |
           xcodebuild -downloadPlatform iOS
-          xcodebuild build -project LocalLLMClientExample.xcodeproj -scheme LocalLLMClientExample -destination 'platform=macOS' CODE_SIGN_IDENTITY="-"
+          xcodebuild build -skipMacroValidation -project LocalLLMClientExample.xcodeproj -scheme LocalLLMClientExample -destination 'platform=macOS' CODE_SIGN_IDENTITY="-"
 
   build-example-ios:
     runs-on: macos-26
@@ -100,7 +100,7 @@ jobs:
         working-directory: Example
         run: |
           xcodebuild -downloadPlatform iOS
-          xcodebuild build -project LocalLLMClientExample.xcodeproj -scheme LocalLLMClientExample -destination 'platform=iOS Simulator,name=iPhone 17 Pro,OS=26.4' CODE_SIGN_IDENTITY="-"
+          xcodebuild build -skipMacroValidation -project LocalLLMClientExample.xcodeproj -scheme LocalLLMClientExample -destination 'platform=iOS Simulator,name=iPhone 17 Pro,OS=26.4' CODE_SIGN_IDENTITY="-"
 
   test-ubuntu-x86_64:
     runs-on: ubuntu-latest

From 758a62e9403eb123b47278d3e435dac3d5bca703 Mon Sep 17 00:00:00 2001
From: "tattn (Tatsuya Tanaka)" <contact@tattn.dev>
Date: Tue, 21 Apr 2026 00:18:13 +0900
Subject: [PATCH 3/3] Fix CI

---
 .../LLMSessionLlamaTests.swift                |   2 +-
 .../LocalLLMClientLlamaToolTests.swift        | 160 +++++++++++++-----
 .../LocalLLMClientLlamaTests/ModelTests.swift |   8 +-
 3 files changed, 126 insertions(+), 44 deletions(-)

diff --git a/Tests/LocalLLMClientLlamaTests/LLMSessionLlamaTests.swift b/Tests/LocalLLMClientLlamaTests/LLMSessionLlamaTests.swift
index 467a701..a667835 100644
--- a/Tests/LocalLLMClientLlamaTests/LLMSessionLlamaTests.swift
+++ b/Tests/LocalLLMClientLlamaTests/LLMSessionLlamaTests.swift
@@ -16,7 +16,7 @@ extension ModelTests {
         
         private static func makeToolModel(size: LocalLLMClient.ModelSize = .default) -> LLMSession.DownloadModel {
             let info = LocalLLMClient.modelInfo(for: .tool, modelSize: size)
-            return .llama(id: info.id, model: info.model, mmproj: info.clip, parameter: .init(context: 1800))
+            return .llama(id: info.id, model: info.model, mmproj: info.clip, parameter: .init(context: 2500))
         }
         
         @Test
diff --git a/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift b/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift
index 3b48856..4bf43e4 100644
--- a/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift
+++ b/Tests/LocalLLMClientLlamaTests/LocalLLMClientLlamaToolTests.swift
@@ -17,15 +17,6 @@ extension ModelTests {
 }
 
 extension ModelTests.LocalLLMClientLlamaToolTests {
-    private func makeToolClient() async throws -> LlamaClient {
-        try await LocalLLMClient.llama(
-            testType: .tool,
-            // Qwen2.5 uses <tool_call> tags; tools must be declared so the PEG parser
-            // includes the tool-call grammar branches.
-            // These tools come from LocalLLMClientTestUtilities (WeatherTool, CalculatorTool).
-        )
-    }
-
     private func buildChatParams(tools: [any LLMTool]) async throws -> (LlamaClient, UnsafeMutablePointer<llm_chat_params>?) {
         let client = try await LocalLLMClient.llama(tools: tools, testType: .tool)
         let wrapped = tools.map { AnyLLMTool($0) }
@@ -33,34 +24,141 @@ extension ModelTests.LocalLLMClientLlamaToolTests {
         return (client, params)
     }
 
+    /// Builds a tool-call response in the syntax picked from the client's chat template and chat format.
+    private func makeToolCallResponse(name: String, argumentsJSON: String, client: LlamaClient) throws -> String {
+        // Decoded on demand: the JSON fallback embeds `argumentsJSON` verbatim and never parses it.
+        func decodeArguments() throws -> Any {
+            let payload = try #require(argumentsJSON.data(using: .utf8))
+            return try JSONSerialization.jsonObject(with: payload)
+        }
+        if usesQwen35XMLToolCallSyntax(chatTemplate: client._context.model.chatTemplate) {
+            let decoded = try decodeArguments()
+            let parameters = try #require(renderQwen35Parameters(decoded))
+            return """
+            <tool_call>
+            <function=\(name)>
+            \(parameters)
+            </function>
+            </tool_call>
+            """
+        }
+
+        switch client.chatFormat {
+        case COMMON_CHAT_FORMAT_PEG_GEMMA4:
+            let decoded = try decodeArguments()
+            let arguments = try #require(renderGemma4Arguments(decoded))
+            return "<|tool_call>call:\(name)\(arguments)<tool_call|>"
+        default:
+            return """
+            <tool_call>
+            {"name": "\(name)", "arguments": \(argumentsJSON)}
+            </tool_call>
+            """
+        }
+    }
+
+    private func usesQwen35XMLToolCallSyntax(chatTemplate: String) -> Bool {
+        ["<function=", "<parameter="].allSatisfy { chatTemplate.contains($0) } // both markers required
+    }
+
+    /// Renders a JSON object as `<parameter=…>` elements, one per key, in sorted key order.
+    ///
+    /// Returns `nil` when `value` is not a dictionary keyed by strings.
+    private func renderQwen35Parameters(_ value: Any) -> String? {
+        guard let arguments = value as? [String: Any] else { return nil }
+
+        var elements: [String] = []
+        for (name, argument) in arguments.sorted(by: { $0.key < $1.key }) {
+            // Opening tag, rendered value, and closing tag each sit on their own line.
+            let lines = ["<parameter=\(name)>", renderQwen35Value(argument), "</parameter>"]
+            elements.append(lines.joined(separator: "\n"))
+        }
+        // Consecutive elements are separated by a single newline.
+        return elements.joined(separator: "\n")
+    }
+
+    /// Renders one JSON value as the text placed inside a `<parameter=…>` element.
+    /// Strings pass through unchanged; booleans, numbers and null use their JSON spelling.
+    private func renderQwen35Value(_ value: Any) -> String {
+        if let text = value as? String {
+            return text
+        }
+        if let number = value as? NSNumber {
+            // A CFBoolean-typed NSNumber is a boolean; every other NSNumber prints as a number.
+            let isBoolean = CFGetTypeID(number) == CFBooleanGetTypeID()
+            return isBoolean ? (number.boolValue ? "true" : "false") : number.stringValue
+        }
+        if value is NSNull { return "null" }
+        // Anything else is serialized as JSON, falling back to its description if that fails.
+        guard let json = try? JSONSerialization.data(withJSONObject: value) else { return String(describing: value) }
+        return String(data: json, encoding: .utf8) ?? String(describing: value)
+    }
+
+    /// Renders a JSON object as a `{key:value,…}` argument list with keys in sorted order.
+    /// Returns `nil` when `value` is not a dictionary keyed by strings.
+    private func renderGemma4Arguments(_ value: Any) -> String? {
+        guard let arguments = value as? [String: Any] else { return nil }
+
+        let body = arguments
+            .sorted { $0.key < $1.key }
+            .map { "\($0.key):\(renderGemma4Value($0.value))" }
+            .joined(separator: ",")
+        return "{" + body + "}"
+    }
+
+    /// Renders one JSON value in the Gemma 4 tool-call argument syntax built by this file.
+    /// Strings are wrapped in `<|"|>` delimiters; object keys are emitted in sorted order.
+    private func renderGemma4Value(_ value: Any) -> String {
+        switch value {
+        case let string as String:
+            // Raw string literal: a quote needs no backslash here, and one would be emitted literally.
+            return #"<|"|>\#(string)<|"|>"#
+        case let number as NSNumber:
+            // A CFBoolean-typed NSNumber is a boolean; every other NSNumber prints as a number.
+            let isBoolean = CFGetTypeID(number) == CFBooleanGetTypeID()
+            return isBoolean ? (number.boolValue ? "true" : "false") : number.stringValue
+        case _ as NSNull:
+            return "null"
+        case let array as [Any]:
+            return "[\(array.map(renderGemma4Value).joined(separator: ","))]"
+        case let dictionary as [String: Any]:
+            let pairs = dictionary.sorted { $0.key < $1.key }
+                .map { "\($0.key):\(renderGemma4Value($0.value))" }
+            return "{\(pairs.joined(separator: ","))}"
+        default:
+            fatalError("Unsupported Gemma 4 argument type: \(type(of: value))")
+        }
+    }
+
     @Test
-    func parseToolCallFromHermesStyleResponse() async throws {
-        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool()])
+    func parseToolCallFromModelNativeResponse() async throws {
+        let (client, chatParams) = try await buildChatParams(tools: [WeatherTool()])
         defer { if let chatParams { free_chat_params(chatParams) } }
 
-        let response = """
-        <tool_call>
-        {"name": "get_weather", "arguments": {"location": "Tokyo", "unit": "celsius"}}
-        </tool_call>
-        """
+        let response = try makeToolCallResponse(
+            name: "get_weather",
+            argumentsJSON: #"{"location":"Tokyo","unit":"celsius"}"#,
+            client: client
+        )
 
         let calls = LlamaToolCallParser.parseToolCalls(from: response, chatParams: chatParams)
         try #require(calls != nil, "Expected tool calls to be extracted from a well-formed response")
         #expect(calls?.count == 1)
         #expect(calls?.first?.name == "get_weather")
+        #expect(calls?.first?.id.isEmpty == false, "An auto-generated UUID should be assigned when the model omits an id")
         #expect(calls?.first?.arguments.contains("Tokyo") == true)
     }
 
     @Test
-    func parseToolCallWithCalculatorTool() async throws {
-        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool(), CalculatorTool()])
+    func parseToolCallWithDeclaredToolSet() async throws {
+        let (client, chatParams) = try await buildChatParams(tools: [WeatherTool(), CalculatorTool()])
         defer { if let chatParams { free_chat_params(chatParams) } }
 
-        let response = """
-        <tool_call>
-        {"name": "calculate", "arguments": {"expression": "15 * 4"}}
-        </tool_call>
-        """
+        let response = try makeToolCallResponse(
+            name: "calculate",
+            argumentsJSON: #"{"expression":"15 * 4"}"#,
+            client: client
+        )
 
         let calls = LlamaToolCallParser.parseToolCalls(from: response, chatParams: chatParams)
         try #require(calls != nil)
@@ -89,22 +187,6 @@ extension ModelTests.LocalLLMClientLlamaToolTests {
         #expect(calls == nil)
     }
 
-    @Test
-    func parseToolCallAssignsIDWhenMissing() async throws {
-        let (_, chatParams) = try await buildChatParams(tools: [WeatherTool()])
-        defer { if let chatParams { free_chat_params(chatParams) } }
-
-        let response = """
-        <tool_call>
-        {"name": "get_weather", "arguments": {"location": "Tokyo", "unit": "celsius"}}
-        </tool_call>
-        """
-
-        let calls = LlamaToolCallParser.parseToolCalls(from: response, chatParams: chatParams)
-        try #require(calls?.first != nil)
-        #expect(!calls!.first!.id.isEmpty, "An auto-generated UUID should be assigned when the model omits an id")
-    }
-
     @Test
     func parseToolCallsReturnsNilWhenChatParamsIsNil() {
         // Guard clause: a missing chatParams pointer must not crash and must return nil.
diff --git a/Tests/LocalLLMClientLlamaTests/ModelTests.swift b/Tests/LocalLLMClientLlamaTests/ModelTests.swift
index 20552cd..60bb1f3 100644
--- a/Tests/LocalLLMClientLlamaTests/ModelTests.swift
+++ b/Tests/LocalLLMClientLlamaTests/ModelTests.swift
@@ -32,14 +32,14 @@ extension LocalLLMClient {
             switch size {
             case .light:
                 return (
-                    id: "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
-                    model: "qwen2.5-0.5b-instruct-q8_0.gguf",
+                    id: "unsloth/Qwen3.5-2B-GGUF",
+                    model: "Qwen3.5-2B-Q4_K_M.gguf",
                     clip: nil
                 )
             case .normal:
                 return (
-                    id: "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
-                    model: "qwen2.5-1.5b-instruct-q4_k_m.gguf",
+                    id: "unsloth/Qwen3.5-2B-GGUF",
+                    model: "Qwen3.5-2B-Q4_K_M.gguf",
                     clip: nil
                 )
             }