From 4c7249e98d2c22e66cfc2fc47aa88a2c43ce99f1 Mon Sep 17 00:00:00 2001
From: baronrabban <3967525+baronrabban@users.noreply.github.com>
Date: Wed, 10 Jun 2026 18:45:52 -0400
Subject: [PATCH] Fix draft model ignoring draft_gpu_split on load

The exllamav3 backend parses the user-configured draft_gpu_split into
self.draft_gpu_split, but load_model_sync passed self.gpu_split (the main
model's split) as use_per_device when loading the draft model, so the
draft split was silently ignored. Pass self.draft_gpu_split instead.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backends/exllamav3/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 23e90ed8..9a9e72b6 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -527,7 +527,7 @@ def load_model_sync(self, progress_callback=None):
         if self.use_draft_model:
             for value in self.draft_model.load_gen(
                 reserve_per_device=self.autosplit_reserve,
-                use_per_device=self.gpu_split,
+                use_per_device=self.draft_gpu_split,
                 callback=progress_callback,
             ):
                 if value: