abetlen · abetlen · Jun 5, 2026 · Jun 5, 2026
diff --git a/README.md b/README.md
@@ -535,7 +535,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
 | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
-| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
+| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) | `Gemma4ChatHandler` | `gemma4` |
 | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.

diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb
@@ -0,0 +1,131 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 5,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "accelerator": "GPU",
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Gemma 4 12B Multimodal Chat\n",
+        "\n",
+        "Run Gemma 4 12B on a Google Colab GPU runtime using the pre-built CUDA wheel for `llama-cpp-python`.\n",
+        "\n",
+        "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
+        "\n",
+        "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%pip install --no-cache-dir --upgrade --force-reinstall \\\n",
+        "  \"huggingface-hub>=0.23.0\" \\\n",
+        "  llama-cpp-python \\\n",
+        "  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from llama_cpp import Llama\n",
+        "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
+        "\n",
+        "# The model file and the mmproj file are both fetched from MODEL_REPO.\n",
+        "MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n",
+        "MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n",
+        "MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n",
+        "\n",
+        "# The chat handler is built from the mmproj file and handed to Llama below.\n",
+        "chat_handler = Gemma4ChatHandler.from_pretrained(\n",
+        "    filename=MMPROJ_FILE, repo_id=MODEL_REPO, verbose=False\n",
+        ")\n",
+        "\n",
+        "llm = Llama.from_pretrained(\n",
+        "    filename=MODEL_FILE,\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    chat_handler=chat_handler,\n",
+        "    n_ctx=8192,\n",
+        "    n_gpu_layers=-1,\n",
+        "    flash_attn=True,\n",
+        "    verbose=False,\n",
+        ")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Text-only request first, before any image is sent to the model.\n",
+        "response = llm.create_chat_completion(\n",
+        "    temperature=0.0,\n",
+        "    max_tokens=32,\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": \"Write the exact string `<stdio.h>` and nothing else.\",\n",
+        "        },\n",
+        "    ],\n",
+        ")\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from IPython.display import Image, display\n",
+        "\n",
+        "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"  # upstream repo: vendor/llama.cpp is a submodule, so the llama-cpp-python raw URL 404s\n",
+        "\n",
+        "display(Image(url=IMAGE_URL, width=320))\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Multimodal request: one user turn holding the text prompt, then the image at IMAGE_URL.\n",
+        "response = llm.create_chat_completion(\n",
+        "    temperature=0.2,\n",
+        "    max_tokens=128,\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": [\n",
+        "                {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n",
+        "                {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
+        "            ],\n",
+        "        },\n",
+        "    ],\n",
+        ")\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    }
+  ]
+}