
Commit b62b194 (parent e361c36)
Author: Brendan Gray

v1.8.32: GPU layers rejection, VRAM undefined fix, context memory preservation, truncated tool parsing, race condition handling, rotation threshold tuning

4 files changed: 156 additions & 16 deletions

main/agenticChat.js (21 additions & 4 deletions)
@@ -933,7 +933,9 @@ function register(ctx) {
       }
 
       // ── Pre-generation context check ──
-      if (iteration > 1) {
+      // Run for ALL iterations, not just > 1, to catch critically high context on first turn
+      // This prevents stalls when context is near full from conversation history
+      {
        const preGenResult = preGenerationContextCheck({
          llmEngine, totalCtx, currentPrompt, fullResponseText,
          allToolResults, contextRotations, MAX_CONTEXT_ROTATIONS: MAX_CONTEXT_ROTATIONS,

@@ -1128,17 +1130,32 @@ function register(ctx) {
       if (isContextOverflow && contextRotations < MAX_CONTEXT_ROTATIONS) {
         if (_pendingPartialBlock) { _pendingPartialBlock = null; }
 
-        // Continuation-overflow: clear partial content and retry
+        // Continuation-overflow: preserve partial content and provide context
+        // Generic solution: works for file writing, conversations, reading, browsing, etc.
         if (continuationCount > 0 && summarizer.completedSteps.length === 0) {
-          fullResponseText = '';
+          // Preserve partial output for context instead of wiping it
+          const partialOutput = fullResponseText.slice(-Math.min(fullResponseText.length, 2000));
+          fullResponseText = ''; // Clear for rotation, but we have partialOutput for context
           continuationCount = 0;
           overflowResponseBudgetReduced = true;
           contextRotations++;
           try { await llmEngine.resetSession(true); } catch (_) {}
           sessionJustRotated = true;
+
+          // Generate a generic summary of what was happening (uses recorded plans/actions)
+          const actionsSummary = summarizer.generateQuickSummary(mcpToolServer?._todos);
+
+          // Build generic continuation prompt that works for ANY task type
+          const partialHint = partialOutput.trim()
+            ? `\n\n## CONTINUE FROM HERE\n---\n${partialOutput.substring(Math.max(0, partialOutput.length - 1500))}\n---`
+            : '';
+
           currentPrompt = {
             systemContext: buildStaticPrompt(),
-            userMessage: buildDynamicContext(Math.floor(maxPromptTokens * 0.10)) + '\n' + message,
+            userMessage: buildDynamicContext(Math.floor(maxPromptTokens * 0.10)) +
+              (actionsSummary ? '\n\n' + actionsSummary : '') +
+              partialHint +
+              '\n\n**Context rotated. Continue the task from where you left off.**\n' + message,
           };
           continue;
         }
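
The preservation logic above bounds the recovered tail twice, once at capture (2000 chars) and once when embedding it into the prompt (1500 chars). A standalone sketch, with an illustrative input and the two expressions copied from the hunk:

    // Capture at most the final 2000 chars of whatever was generated.
    const fullResponseText = 'line 1\n' + 'x'.repeat(5000) + '\nlast partial line';
    const partialOutput = fullResponseText.slice(-Math.min(fullResponseText.length, 2000));

    // Embed at most the final 1500 chars into the continuation prompt, and
    // skip the hint entirely when nothing useful was produced.
    const partialHint = partialOutput.trim()
      ? `\n\n## CONTINUE FROM HERE\n---\n${partialOutput.substring(Math.max(0, partialOutput.length - 1500))}\n---`
      : '';

    console.log(partialOutput.length, partialHint.length); // 2000, ~1500: bounded regardless of response size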

main/agenticChatHelpers.js (3 additions & 2 deletions)
@@ -375,8 +375,9 @@ function progressiveContextCompaction(options) {
     if (chatHistory) pruned += pruneVerboseHistory(chatHistory, 2);
   }
 
-  // Proactive rotation at 78% to prevent context overflow (was 80%)
-  const shouldRotate = pct > 0.78;
+  // Proactive rotation at 72% to prevent context stalls (lowered from 78%)
+  // This gives more headroom for large file generation before hitting overflow
+  const shouldRotate = pct > 0.72;
 
   if (pruned > 0) {
     console.log(`[Context Compaction] Phase ${pct > 0.75 ? 3 : pct > 0.60 ? 2 : 1}: compacted ${pruned} items at ${Math.round(pct * 100)}% usage`);
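
Assuming pct is the fraction of the context window currently in use (consistent with the percentage logged just above), the new threshold plays out like this sketch:

    // Sketch, assuming pct = usedTokens / totalCtx as the helper suggests.
    function shouldRotateAt(usedTokens, totalCtx) {
      const pct = usedTokens / totalCtx;
      return pct > 0.72; // was 0.78: on a 32k context, roughly 2000 tokens earlier
    }

    console.log(shouldRotateAt(24000, 32768)); // ~0.73 -> true (false under the old 0.78)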

main/llmEngine.js (71 additions & 10 deletions)
@@ -10,7 +10,7 @@ const { detectFamily, detectParamSize } = require('./modelDetection');
 const { sanitizeResponse } = require('./sanitize');
 
 // ─── Constants ───
-const STALL_TIMEOUT_MS = 90_000;
+const STALL_TIMEOUT_MS = 45_000;
 const MAX_HISTORY_ENTRIES = 40;
 const GPU_INIT_TIMEOUT = 120_000;
 const MODEL_LOAD_TIMEOUT = 180_000;

@@ -215,18 +215,31 @@ class LLMEngine extends EventEmitter {
     const llamaCppPath = this._getNodeLlamaCppPath();
     const { getLlama, LlamaChat, InputLookupTokenPredictor } = await import(pathToFileURL(llamaCppPath).href);
 
-    // Cancel any in-flight generation
+    // Cancel any in-flight generation FIRST, then wait
     if (this.abortController) {
       this.cancelGeneration('model-switch');
     }
-    // Wait for generation to settle before disposing (prevents "Object is disposed" race)
+    // Wait for active generation to fully complete before disposing
     if (this._activeGenerationPromise) {
-      try { await this._activeGenerationPromise; } catch {}
+      try {
+        await Promise.race([
+          this._activeGenerationPromise,
+          new Promise(r => setTimeout(r, 3000)), // Max 3s wait for stuck generation
+        ]);
+      } catch {}
+    }
+    // Extended settle time for node-llama-cpp internal async ops
+    // (_eraseContextTokenRanges, streaming callbacks, etc.)
+    // Increased from 500ms to 1000ms to prevent "Object is disposed" race
+    await new Promise(r => setTimeout(r, 1000));
+
+    // Wrap dispose in additional try-catch for race protection
+    try {
+      await this._dispose();
+    } catch (disposeErr) {
+      const log = require('./logger');
+      log.warn(`Dispose error (may be expected during model switch): ${disposeErr.message}`);
     }
-    // Extra settle time for node-llama-cpp internal async ops (_eraseContextTokenRanges etc.)
-    // that may still be in-flight after the generation promise resolves
-    await new Promise(r => setTimeout(r, 500));
-    await this._dispose();
 
     if (loadSignal.aborted) throw new Error('Load cancelled');
 

@@ -283,6 +296,16 @@
         bestAutoGpuLayers = loadedModel.gpuLayers;
       }
 
+      // Reject 'cuda' mode if it loaded 0 layers despite available VRAM
+      // This forces fallback to explicit layer counts which work better on constrained VRAM
+      if (mode === 'cuda' && loadedModel.gpuLayers === 0 && gpuConfig.vramGB > 0.5) {
+        const log = require('./logger');
+        log.warn(`CUDA mode loaded 0 layers despite ${gpuConfig.vramGB.toFixed(1)}GB VRAM — trying next mode`);
+        loadedModel.dispose?.();
+        loadedModel = null;
+        continue;
+      }
+
       // Now try to create context on this model
       const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU;
       let maxCtx = this._computeMaxContext(gpuConfig.modelSizeGB);
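
The rejection only works because load modes are tried in order inside a loop, which the `continue` implies. A rough sketch with hypothetical mode values (tryLoad and the explicit layer counts are illustrative stand-ins, not the engine's actual list):

    // Illustrative fallback order enabled by the 0-layer check.
    async function loadWithFallback(tryLoad, vramGB) {
      for (const mode of ['cuda', 32, 16, 8, false]) { // false = CPU-only
        const loaded = await tryLoad(mode);
        if (!loaded) continue;
        // Auto-offload under 'cuda' sometimes picks 0 layers on constrained
        // VRAM; reject it so the explicit layer counts get their turn.
        if (mode === 'cuda' && loaded.gpuLayers === 0 && vramGB > 0.5) {
          loaded.dispose?.();
          continue;
        }
        return loaded;
      }
      throw new Error('No load mode succeeded');
    }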

@@ -1109,8 +1132,28 @@
       try { this.sequence.dispose?.(); } catch {}
       this.sequence = null;
     }
+
+    // Try to get a new sequence, with fallback to recreate context if "No sequences left"
     if (this.context) {
-      this.sequence = this.context.getSequence();
+      try {
+        this.sequence = this.context.getSequence();
+      } catch (seqErr) {
+        const log = require('./logger');
+        log.warn(`getSequence failed: ${seqErr.message} — recreating context`);
+
+        // Context is exhausted, recreate it
+        try { this.context.dispose?.(); } catch {}
+        this.context = await this.model.createContext({
+          contextSize: { min: 512, max: this._computeMaxContext(0) },
+          flashAttention: true,
+          ignoreMemorySafetyChecks: true,
+          failedCreationRemedy: { retries: 4, autoContextSizeShrink: 0.5 },
+        });
+
+        if (this.context) {
+          this.sequence = this.context.getSequence();
+        }
+      }
     }
 
     if (!this.sequence || this.sequence._disposed) {
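
The recover-or-recreate pattern in isolation, as a sketch; `makeContext` is a hypothetical factory standing in for the `this.model.createContext(...)` call with the options shown above:

    // Sketch: acquire a sequence, rebuilding the context when its pool
    // is exhausted ("No sequences left").
    async function getFreshSequence(context, makeContext) {
      try {
        return { context, sequence: context.getSequence() };
      } catch {
        // Pool exhausted: dispose, rebuild, and take a sequence from
        // the brand-new pool.
        try { context.dispose?.(); } catch {}
        const fresh = await makeContext();
        return { context: fresh, sequence: fresh.getSequence() };
      }
    }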

@@ -1207,7 +1250,25 @@
       };
       return this.gpuInfo;
     } catch {
-      return this.gpuInfo; // Return cached value or null
+      // Return default values instead of null to prevent undefined UI display
+      if (!this.gpuInfo) {
+        return {
+          name: 'Unknown',
+          memoryTotal: 0,
+          memoryUsed: 0,
+          memoryFree: 0,
+          memoryTotalGB: 0,
+          memoryUsedGB: 0,
+          memoryFreeGB: 0,
+          usagePercent: 0,
+          utilization: 0,
+          temperature: 0,
+          isActive: false,
+          gpuLayers: this.modelInfo?.gpuLayers || 0,
+          backend: this.modelInfo?.gpuMode || 'unknown',
+        };
+      }
+      return this.gpuInfo; // Return cached valid value
     }
   }
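
One way to read the zero-filled object: UI code tends to interpolate these fields directly, so a null (or sparse) result surfaces as "undefined" in the display, which matches the "VRAM undefined fix" in the commit message. A hypothetical consumer illustrating the failure mode:

    // Hypothetical renderer-side consumer, not the app's actual UI code.
    function vramLabel(info) {
      // With a null return this threw a TypeError; with missing fields it
      // rendered "undefined GB". The zero-filled defaults keep it printable.
      return `${info.memoryUsedGB} GB / ${info.memoryTotalGB} GB (${info.usagePercent}%)`;
    }

    console.log(vramLabel({ memoryUsedGB: 0, memoryTotalGB: 0, usagePercent: 0 }));
    // -> "0 GB / 0 GB (0%)"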

main/tools/toolParser.js (61 additions & 0 deletions)
@@ -330,6 +330,67 @@ function parseToolCalls(text) {
     }
   }
 
+  // Method 1.6: Truncated tool call recovery — handles incomplete JSON from maxTokens cutoff
+  // This catches cases where Method 1.5 fails because JSON is truncated mid-content
+  if (calls.length === 0) {
+    // Check for unclosed fence with tool name but incomplete JSON
+    const fenceStart = text.search(/```(?:tool_call|tool|json)/);
+    if (fenceStart !== -1) {
+      const afterFence = text.slice(fenceStart);
+      const hasClosingFence = /```(?:tool_call|tool|json)[^\n]*\n[\s\S]*?```/.test(afterFence);
+
+      if (!hasClosingFence) {
+        // Found unclosed fence — try to extract truncated tool call
+        const toolNameMatch = afterFence.match(/\{\s*["']?(?:tool|name)["']?\s*:\s*["']([^"']+)["']/i);
+        if (toolNameMatch) {
+          const rawToolName = toolNameMatch[1].toLowerCase().replace(/-/g, '_');
+          const toolName = TOOL_NAME_ALIASES[rawToolName] || rawToolName;
+
+          if (VALID_TOOLS.has(toolName)) {
+            const call = { tool: toolName, params: {}, _truncated: true };
+
+            // Extract filePath if present
+            const pathMatch = afterFence.match(/"(?:filePath|file_path|path)"\s*:\s*"([^"]+)"/i);
+            if (pathMatch) call.params.filePath = pathMatch[1];
+
+            // For write_file/append_to_file, extract partial content
+            if (toolName === 'write_file' || toolName === 'append_to_file') {
+              const contentMatch = afterFence.match(/"content"\s*:\s*"([\s\S]*)/);
+              if (contentMatch) {
+                let content = contentMatch[1];
+                // Find the actual content by removing trailing truncated chars
+                // Look for last complete line before truncation
+                const lines = content.split('\\n');
+                if (lines.length > 1) {
+                  // Remove last potentially truncated line
+                  lines.pop();
+                  content = lines.join('\\n');
+                }
+                // Unescape JSON encoding
+                content = content
+                  .replace(/\\n/g, '\n')
+                  .replace(/\\t/g, '\t')
+                  .replace(/\\r/g, '\r')
+                  .replace(/\\"/g, '"')
+                  .replace(/\\\\/g, '\\');
+                call.params.content = content;
+              }
+            }
+
+            // For read_file, extract lineRange if present
+            if (toolName === 'read_file') {
+              const rangeMatch = afterFence.match(/"(?:lineRange|lines)"\s*:\s*\[(\d+)\s*,\s*(\d+)\]/i);
+              if (rangeMatch) call.params.lineRange = [parseInt(rangeMatch[1]), parseInt(rangeMatch[2])];
+            }
+
+            console.log(`[ToolParser] Recovered truncated ${toolName} call`);
+            addCall(call);
+          }
+        }
+      }
+    }
+  }
+
   if (calls.length > 0) return _postProcess(calls, text);
 
   // Method 1.8: OpenAI array format — [{"name":"...", "arguments":{...}}]
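
A sketch of the input this method targets and what it recovers, assuming the earlier, stricter methods reject the unterminated JSON:

    // Example: generation hit maxTokens mid-string, so neither the JSON
    // nor the code fence ever closes. The '\\n' pairs model the
    // JSON-escaped newlines in the raw model output.
    const truncated =
      '```tool_call\n' +
      '{"tool": "write_file", "filePath": "notes.txt", "content": "line one\\nline two\\nline thr';

    const calls = parseToolCalls(truncated);
    // -> roughly [{ tool: 'write_file',
    //               params: { filePath: 'notes.txt', content: 'line one\nline two' },
    //               _truncated: true }]
    // The last, possibly half-written line ("line thr") is dropped, and the
    // _truncated flag lets callers treat the recovered output as incomplete.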
