@@ -10,7 +10,7 @@ const { detectFamily, detectParamSize } = require('./modelDetection');
1010const { sanitizeResponse } = require ( './sanitize' ) ;
1111
1212// ─── Constants ───
13- const STALL_TIMEOUT_MS = 90_000 ;
13+ const STALL_TIMEOUT_MS = 45_000 ;
1414const MAX_HISTORY_ENTRIES = 40 ;
1515const GPU_INIT_TIMEOUT = 120_000 ;
1616const MODEL_LOAD_TIMEOUT = 180_000 ;
@@ -215,18 +215,31 @@ class LLMEngine extends EventEmitter {
215215 const llamaCppPath = this . _getNodeLlamaCppPath ( ) ;
216216 const { getLlama, LlamaChat, InputLookupTokenPredictor } = await import ( pathToFileURL ( llamaCppPath ) . href ) ;
217217
218- // Cancel any in-flight generation
218+ // Cancel any in-flight generation FIRST, then wait
219219 if ( this . abortController ) {
220220 this . cancelGeneration ( 'model-switch' ) ;
221221 }
222- // Wait for generation to settle before disposing (prevents "Object is disposed" race)
222+ // Wait for active generation to fully complete before disposing
223223 if ( this . _activeGenerationPromise ) {
224- try { await this . _activeGenerationPromise ; } catch { }
224+ try {
225+ await Promise . race ( [
226+ this . _activeGenerationPromise ,
227+ new Promise ( r => setTimeout ( r , 3000 ) ) , // Max 3s wait for stuck generation
228+ ] ) ;
229+ } catch { }
230+ }
231+ // Extended settle time for node-llama-cpp internal async ops
232+ // (_eraseContextTokenRanges, streaming callbacks, etc.)
233+ // Increased from 500ms to 1000ms to prevent "Object is disposed" race
234+ await new Promise ( r => setTimeout ( r , 1000 ) ) ;
235+
236+ // Wrap dispose in additional try-catch for race protection
237+ try {
238+ await this . _dispose ( ) ;
239+ } catch ( disposeErr ) {
240+ const log = require ( './logger' ) ;
241+ log . warn ( `Dispose error (may be expected during model switch): ${ disposeErr . message } ` ) ;
225242 }
226- // Extra settle time for node-llama-cpp internal async ops (_eraseContextTokenRanges etc.)
227- // that may still be in-flight after the generation promise resolves
228- await new Promise ( r => setTimeout ( r , 500 ) ) ;
229- await this . _dispose ( ) ;
230243
231244 if ( loadSignal . aborted ) throw new Error ( 'Load cancelled' ) ;
232245
@@ -283,6 +296,16 @@ class LLMEngine extends EventEmitter {
283296 bestAutoGpuLayers = loadedModel . gpuLayers ;
284297 }
285298
299+ // Reject 'cuda' mode if it loaded 0 layers despite available VRAM
300+ // This forces fallback to explicit layer counts which work better on constrained VRAM
301+ if ( mode === 'cuda' && loadedModel . gpuLayers === 0 && gpuConfig . vramGB > 0.5 ) {
302+ const log = require ( './logger' ) ;
303+ log . warn ( `CUDA mode loaded 0 layers despite ${ gpuConfig . vramGB . toFixed ( 1 ) } GB VRAM — trying next mode` ) ;
304+ loadedModel . dispose ?. ( ) ;
305+ loadedModel = null ;
306+ continue ;
307+ }
308+
286309 // Now try to create context on this model
287310 const ctxTimeout = mode === false ? CTX_CREATE_TIMEOUT_CPU : CTX_CREATE_TIMEOUT_GPU ;
288311 let maxCtx = this . _computeMaxContext ( gpuConfig . modelSizeGB ) ;
@@ -1109,8 +1132,28 @@ class LLMEngine extends EventEmitter {
11091132 try { this . sequence . dispose ?. ( ) ; } catch { }
11101133 this . sequence = null ;
11111134 }
1135+
1136+ // Try to get a new sequence, with fallback to recreate context if "No sequences left"
11121137 if ( this . context ) {
1113- this . sequence = this . context . getSequence ( ) ;
1138+ try {
1139+ this . sequence = this . context . getSequence ( ) ;
1140+ } catch ( seqErr ) {
1141+ const log = require ( './logger' ) ;
1142+ log . warn ( `getSequence failed: ${ seqErr . message } — recreating context` ) ;
1143+
1144+ // Context is exhausted, recreate it
1145+ try { this . context . dispose ?. ( ) ; } catch { }
1146+ this . context = await this . model . createContext ( {
1147+ contextSize : { min : 512 , max : this . _computeMaxContext ( 0 ) } ,
1148+ flashAttention : true ,
1149+ ignoreMemorySafetyChecks : true ,
1150+ failedCreationRemedy : { retries : 4 , autoContextSizeShrink : 0.5 } ,
1151+ } ) ;
1152+
1153+ if ( this . context ) {
1154+ this . sequence = this . context . getSequence ( ) ;
1155+ }
1156+ }
11141157 }
11151158
11161159 if ( ! this . sequence || this . sequence . _disposed ) {
@@ -1207,7 +1250,25 @@ class LLMEngine extends EventEmitter {
12071250 } ;
12081251 return this . gpuInfo ;
12091252 } catch {
1210- return this . gpuInfo ; // Return cached value or null
1253+ // Return default values instead of null to prevent undefined UI display
1254+ if ( ! this . gpuInfo ) {
1255+ return {
1256+ name : 'Unknown' ,
1257+ memoryTotal : 0 ,
1258+ memoryUsed : 0 ,
1259+ memoryFree : 0 ,
1260+ memoryTotalGB : 0 ,
1261+ memoryUsedGB : 0 ,
1262+ memoryFreeGB : 0 ,
1263+ usagePercent : 0 ,
1264+ utilization : 0 ,
1265+ temperature : 0 ,
1266+ isActive : false ,
1267+ gpuLayers : this . modelInfo ?. gpuLayers || 0 ,
1268+ backend : this . modelInfo ?. gpuMode || 'unknown' ,
1269+ } ;
1270+ }
1271+ return this . gpuInfo ; // Return cached valid value
12111272 }
12121273 }
12131274
0 commit comments