v1.8.32: GPU layers rejection, VRAM undefined fix, context memory preservation, truncated tool parsing, race condition handling, rotation threshold tuning

Brendan Gray · Brendan Gray · commit b62b194487b3 · 2026-03-13T07:54:47.000-04:00
diff --git a/main/agenticChat.js b/main/agenticChat.js
@@ -933,7 +933,9 @@ function register(ctx) {
       }
 
       // ── Pre-generation context check ──
-      if (iteration > 1) {
+      // Run for ALL iterations, not just > 1, to catch critically high context on first turn
+      // This prevents stalls when context is near full from conversation history
+      {
         const preGenResult = preGenerationContextCheck({
           llmEngine, totalCtx, currentPrompt, fullResponseText,
           allToolResults, contextRotations, MAX_CONTEXT_ROTATIONS: MAX_CONTEXT_ROTATIONS,
@@ -1128,17 +1130,32 @@ function register(ctx) {
         if (isContextOverflow && contextRotations < MAX_CONTEXT_ROTATIONS) {
           if (_pendingPartialBlock) { _pendingPartialBlock = null; }
 
-          // Continuation-overflow: clear partial content and retry
+          // Continuation-overflow: preserve partial content and provide context
+          // Generic solution: works for file writing, conversations, reading, browsing, etc.
           if (continuationCount > 0 && summarizer.completedSteps.length === 0) {
-            fullResponseText = '';
+            // Preserve partial output for context instead of wiping it
+            const partialOutput = fullResponseText.slice(-Math.min(fullResponseText.length, 2000));
+            fullResponseText = '';  // Clear for rotation, but we have partialOutput for context
             continuationCount = 0;
             overflowResponseBudgetReduced = true;
             contextRotations++;
             try { await llmEngine.resetSession(true); } catch (_) {}
             sessionJustRotated = true;
+            
+            // Generate a generic summary of what was happening (uses recorded plans/actions)
+            const actionsSummary = summarizer.generateQuickSummary(mcpToolServer?._todos);
+            
+            // Build generic continuation prompt that works for ANY task type
+            const partialHint = partialOutput.trim() 
+              ? `\n\n## CONTINUE FROM HERE\n---\n${partialOutput.substring(Math.max(0, partialOutput.length - 1500))}\n---`
+              : '';
+            
             currentPrompt = {
               systemContext: buildStaticPrompt(),
-              userMessage: buildDynamicContext(Math.floor(maxPromptTokens * 0.10)) + '\n' + message,
+              userMessage: buildDynamicContext(Math.floor(maxPromptTokens * 0.10)) + 
+                (actionsSummary ? '\n\n' + actionsSummary : '') +
+                partialHint +
+                '\n\n**Context rotated. Continue the task from where you left off.**\n' + message,
             };
             continue;
           }
diff --git a/main/agenticChatHelpers.js b/main/agenticChatHelpers.js
@@ -375,8 +375,9 @@ function progressiveContextCompaction(options) {
     if (chatHistory) pruned += pruneVerboseHistory(chatHistory, 2);
   }
 
-  // Proactive rotation at 78% to prevent context overflow (was 80%)
-  const shouldRotate = pct > 0.78;
+  // Proactive rotation at 72% to prevent context stalls (lowered from 78%)
+  // This gives more headroom for large file generation before hitting overflow
+  const shouldRotate = pct > 0.72;
 
   if (pruned > 0) {
     console.log(`[Context Compaction] Phase ${pct > 0.75 ? 3 : pct > 0.60 ? 2 : 1}: compacted ${pruned} items at ${Math.round(pct * 100)}% usage`);
diff --git a/main/llmEngine.js b/main/llmEngine.js
@@ -10,7 +10,7 @@ const { detectFamily, detectParamSize } = require('./modelDetection');
 const { sanitizeResponse } = require('./sanitize');
 
 // ─── Constants ───
-const STALL_TIMEOUT_MS = 90_000;
+const STALL_TIMEOUT_MS = 45_000;
 const MAX_HISTORY_ENTRIES = 40;
 const GPU_INIT_TIMEOUT = 120_000;
 const MODEL_LOAD_TIMEOUT = 180_000;
@@ -215,18 +215,31 @@ class LLMEngine extends EventEmitter {
       const llamaCppPath = this._getNodeLlamaCppPath();
       const { getLlama, LlamaChat, InputLookupTokenPredictor } = await import(pathToFileURL(llamaCppPath).href);
 
-      // Cancel any in-flight generation
+      // Cancel any in-flight generation FIRST, then wait
       if (this.abortController) {
         this.cancelGeneration('model-switch');
       }
-      // Wait for generation to settle before disposing (prevents "Object is disposed" race)
+      // Wait (bounded, up to 3s) for the active generation to settle before disposing
       if (this._activeGenerationPromise) {
-        try { await this._activeGenerationPromise; } catch {}
+        try { 
+          await Promise.race([
+            this._activeGenerationPromise,
+            new Promise(r => setTimeout(r, 3000)), // Max 3s wait for stuck generation
+          ]); 
+        } catch {}
+      }
+      // Extended settle time for node-llama-cpp internal async ops
+      // (_eraseContextTokenRanges, streaming callbacks, etc.)
+      // Increased from 500ms to 1000ms to prevent "Object is disposed" race
+      await new Promise(r => setTimeout(r, 1000));
+      
+      // Wrap dispose in additional try-catch for race protection
+      try {
+        await this._dispose();
+      } catch (disposeErr) {
+        const log = require('./logger');
+        log.warn(`Dispose error (may be expected during model switch): ${disposeErr.message}`);
       }
-      // Extra settle time for node-llama-cpp internal async ops (_eraseContextTokenRanges etc.)
-      // that may still be in-flight after the generation promise resolves
-      await new Promise(r => setTimeout(r, 500));
-      await this._dispose();
 
       if (loadSignal.aborted) throw new Error('Load cancelled');
 
@@ -283,6 +296,16 @@ class LLMEngine extends EventEmitter {
             bestAutoGpuLayers = loadedModel.gpuLayers;
           }
 
+          // Reject 'cuda' mode if it loaded 0 layers despite available VRAM
+          // This forces fallback to explicit layer counts which work better on constrained VRAM
+          if (mode === 'cuda' && loadedModel.gpuLayers === 0 && gpuConfig.vramGB > 0.5) {
+            const log = require('./logger');
+            log.warn(`CUDA mode loaded 0 layers despite ${gpuConfig.vramGB.toFixed(1)}GB VRAM — trying next mode`);
+            loadedModel.dispose?.();
+            loadedModel = null;
+            continue;
+          }
+
           // Now try to create context on this model
           const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
           let maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
@@ -1109,8 +1132,28 @@ class LLMEngine extends EventEmitter {
       try { this.sequence.dispose?.(); } catch {}
       this.sequence = null;
     }
+    
+    // Try to get a new sequence; on any getSequence failure (e.g. "No sequences left"), recreate the context and retry
     if (this.context) {
-      this.sequence = this.context.getSequence();
+      try {
+        this.sequence = this.context.getSequence();
+      } catch (seqErr) {
+        const log = require('./logger');
+        log.warn(`getSequence failed: ${seqErr.message} — recreating context`);
+        
+        // Context is exhausted, recreate it
+        try { this.context.dispose?.(); } catch {}
+        this.context = await this.model.createContext({
+          contextSize: { min: 512, max: this._computeMaxContext(0) },
+          flashAttention: true,
+          ignoreMemorySafetyChecks: true,
+          failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+        });
+        
+        if (this.context) {
+          this.sequence = this.context.getSequence();
+        }
+      }
     }
 
     if (!this.sequence || this.sequence._disposed) {
@@ -1207,7 +1250,25 @@ class LLMEngine extends EventEmitter {
       };
       return this.gpuInfo;
     } catch {
-      return this.gpuInfo; // Return cached value or null
+      // Return default values instead of null to prevent undefined UI display
+      if (!this.gpuInfo) {
+        return {
+          name: 'Unknown',
+          memoryTotal: 0,
+          memoryUsed: 0,
+          memoryFree: 0,
+          memoryTotalGB: 0,
+          memoryUsedGB: 0,
+          memoryFreeGB: 0,
+          usagePercent: 0,
+          utilization: 0,
+          temperature: 0,
+          isActive: false,
+          gpuLayers: this.modelInfo?.gpuLayers || 0,
+          backend: this.modelInfo?.gpuMode || 'unknown',
+        };
+      }
+      return this.gpuInfo; // Return cached valid value
     }
   }
 
diff --git a/main/tools/toolParser.js b/main/tools/toolParser.js
@@ -330,6 +330,67 @@ function parseToolCalls(text) {
     }
   }
 
+  // Method 1.6: Truncated tool call recovery — handles incomplete JSON from maxTokens cutoff
+  // This catches cases where Method 1.5 fails because JSON is truncated mid-content
+  if (calls.length === 0) {
+    // Check for unclosed fence with tool name but incomplete JSON
+    const fenceStart = text.search(/```(?:tool_call|tool|json)/);
+    if (fenceStart !== -1) {
+      const afterFence = text.slice(fenceStart);
+      const hasClosingFence = /```(?:tool_call|tool|json)[^\n]*\n[\s\S]*?```/.test(afterFence);
+      
+      if (!hasClosingFence) {
+        // Found unclosed fence — try to extract truncated tool call
+        const toolNameMatch = afterFence.match(/\{\s*["']?(?:tool|name)["']?\s*:\s*["']([^"']+)["']/i);
+        if (toolNameMatch) {
+          const rawToolName = toolNameMatch[1].toLowerCase().replace(/-/g, '_');
+          const toolName = TOOL_NAME_ALIASES[rawToolName] || rawToolName;
+          
+          if (VALID_TOOLS.has(toolName)) {
+            const call = { tool: toolName, params: {}, _truncated: true };
+            
+            // Extract filePath if present
+            const pathMatch = afterFence.match(/"(?:filePath|file_path|path)"\s*:\s*"([^"]+)"/i);
+            if (pathMatch) call.params.filePath = pathMatch[1];
+            
+            // For write_file/append_to_file, extract partial content
+            if (toolName === 'write_file' || toolName === 'append_to_file') {
+              const contentMatch = afterFence.match(/"content"\s*:\s*"([\s\S]*)/);
+              if (contentMatch) {
+                let content = contentMatch[1];
+                // Find the actual content by removing trailing truncated chars
+                // Look for last complete line before truncation
+                const lines = content.split('\\n');
+                if (lines.length > 1) {
+                  // Remove last potentially truncated line
+                  lines.pop();
+                  content = lines.join('\\n');
+                }
+                // Unescape the common JSON escapes (\n, \t, \r, \", \\); other escapes (e.g. \uXXXX) are left as-is
+                content = content
+                  .replace(/\\n/g, '\n')
+                  .replace(/\\t/g, '\t')
+                  .replace(/\\r/g, '\r')
+                  .replace(/\\"/g, '"')
+                  .replace(/\\\\/g, '\\');
+                call.params.content = content;
+              }
+            }
+            
+            // For read_file, extract lineRange if present
+            if (toolName === 'read_file') {
+              const rangeMatch = afterFence.match(/"(?:lineRange|lines)"\s*:\s*\[(\d+)\s*,\s*(\d+)\]/i);
+              if (rangeMatch) call.params.lineRange = [parseInt(rangeMatch[1]), parseInt(rangeMatch[2])];
+            }
+            
+            console.log(`[ToolParser] Recovered truncated ${toolName} call`);
+            addCall(call);
+          }
+        }
+      }
+    }
+  }
+
   if (calls.length > 0) return _postProcess(calls, text);
 
   // Method 1.8: OpenAI array format — [{"name":"...", "arguments":{...}}]