From 4c7249e98d2c22e66cfc2fc47aa88a2c43ce99f1 Mon Sep 17 00:00:00 2001
From: baronrabban <3967525+baronrabban@users.noreply.github.com>
Date: Wed, 10 Jun 2026 18:45:52 -0400
Subject: [PATCH] Fix draft model ignoring draft_gpu_split on load

The exllamav3 backend parses the user-configured draft_gpu_split into
self.draft_gpu_split, but load_model_sync passed self.gpu_split (the
main model's split) when loading the draft model, so the draft split
was silently ignored. Use self.draft_gpu_split instead.

Co-Authored-By: Claude Opus 4.8
---
 backends/exllamav3/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 23e90ed8..9a9e72b6 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -527,7 +527,7 @@ def load_model_sync(self, progress_callback=None):
         if self.use_draft_model:
             for value in self.draft_model.load_gen(
                 reserve_per_device=self.autosplit_reserve,
-                use_per_device=self.gpu_split,
+                use_per_device=self.draft_gpu_split,
                 callback=progress_callback,
             ):
                 if value: