back to *config_path*, preserving all other keys.
Writes via a sibling temp file + atomic rename so an interrupted run
(Ctrl-C, OOM, full disk on the final flush) cannot leave the user
with a truncated config.json. Encoding is pinned to UTF-8 so the
custom `vlm.prompt` survives a round-trip on platforms with a
non-UTF-8 default locale (Windows in particular).
+
+ *key* selects which slot to write — "model" (default, primary),
+ "model_fast", "model_actions", or "model_verify" for the multipass
+ pipeline auxiliary models.
"""
with open(config_path, encoding="utf-8") as f:
cfg = json.load(f)
- cfg.setdefault("vlm", {})["model"] = model
+ cfg.setdefault("vlm", {})[key] = model
dir_name = os.path.dirname(os.path.abspath(config_path)) or "."
fd, tmp = tempfile.mkstemp(
prefix=".config.", suffix=".json.tmp", dir=dir_name,
@@ -193,6 +198,8 @@ def ensure_vlm_model(config: dict, config_path: str, *,
file=sys.stderr)
vlm["enabled"] = False
return
+ print("\n[vlm] Pick the PRIMARY model (used for Pass 2 / single-shot).",
+ file=sys.stderr)
chosen = pick_model_paginated(models)
if not chosen:
print("[vlm] No model chosen — VLM disabled for this run.",
@@ -201,9 +208,46 @@ def ensure_vlm_model(config: dict, config_path: str, *,
return
vlm["model"] = chosen
try:
- save_model_to_config(config_path, chosen)
+ save_model_to_config(config_path, chosen, key="model")
print(f"[vlm] Saved vlm.model = {chosen!r} to {config_path}",
file=sys.stderr)
except Exception as e:
print(f"[vlm] (Could not write {config_path}: {e}; using for this "
f"run only.)", file=sys.stderr)
+
+ # Multipass auxiliaries are optional. Only prompt when the run is
+ # actually configured for multipass and the slot is still empty — a
+ # pre-set value from config.json is honoured without re-asking.
+ if (vlm.get("mode") or "single").lower() != "multipass":
+ return
+
+ for slot, label, help_text in (
+ ("model_fast", "FAST",
+ "Used for Pass 1 (scene) and per-widget crop labelling. A small "
+ "model (e.g. qwen2.5vl:3b or moondream) is plenty. Skip to reuse "
+ "the primary model."),
+ ("model_actions", "ACTIONS",
+ "Used for Pass 3 (next-action candidates). No image is sent on "
+ "this pass, so a strong text-only LLM (e.g. qwen2.5:14b) is "
+ "cheaper than a VLM. Skip to reuse the primary model."),
+ ("model_verify", "VERIFY",
+ "OPTIONAL. Used for the verify pass that cross-checks pass-2 "
+ "controls against the accessibility tree. Pick a different model "
+ "family from the primary for a genuine second opinion. Skip to "
+ "leave the verify pass disabled."),
+ ):
+ if vlm.get(slot):
+ continue
+ print(f"\n[vlm] Pick the {label} model (optional). {help_text}",
+ file=sys.stderr)
+ picked = pick_model_paginated(models)
+ if not picked:
+ continue
+ vlm[slot] = picked
+ try:
+ save_model_to_config(config_path, picked, key=slot)
+ print(f"[vlm] Saved vlm.{slot} = {picked!r} to {config_path}",
+ file=sys.stderr)
+ except Exception as e:
+ print(f"[vlm] (Could not write {config_path}: {e}; using for "
+ f"this run only.)", file=sys.stderr)
diff --git a/web_inspector.py b/web_inspector.py
index 2a1c452..caaa872 100755
--- a/web_inspector.py
+++ b/web_inspector.py
@@ -189,6 +189,12 @@
}
.desc-section { margin-bottom: 20px; }
.desc-label { font-family: var(--mono); font-size: 9px; letter-spacing: 0.15em; color: var(--text-dim); text-transform: uppercase; margin-bottom: 6px; padding: 2px 8px; border-left: 2px solid var(--cyan); }
+.vlm-envelope { font-family: var(--mono); font-size: 11px; line-height: 1.5; }
+.vlm-row { display: grid; grid-template-columns: minmax(120px, max-content) 1fr; gap: 12px; padding: 4px 8px; border-bottom: 1px solid var(--border); }
+.vlm-row:last-child { border-bottom: 0; }
+.vlm-k { color: var(--text-dim); text-transform: uppercase; letter-spacing: 0.08em; font-size: 10px; padding-top: 2px; }
+.vlm-v { color: var(--text-hi); word-break: break-word; }
+.vlm-v pre { margin: 0; font-size: 10.5px; }
/* ── Sketch panel ────────────────────────────────────────────────────────── */
#sketch-panel pre {
@@ -631,6 +637,35 @@
} catch(e) { panel.innerHTML = `${esc(String(e))}`; setStatus('ERROR'); }
}
+// Render the structured envelope returned by VLM multipass mode as an
+// HTML definition list: keys named in `order` first, then any other keys.
+// Strings go through esc(); objects/arrays fall back to escaped pretty JSON.
+function renderVlmEnvelope(env) {
+ const rows = []; // one rendered string per envelope key
+ const order = [
+ 'summary', 'app', 'screen_type', 'primary_task',
+ 'focused', 'modal_open', 'controls', 'next_actions',
+ 'sensitive_regions', 'confidence', 'discrepancies', '_passes',
+ ];
+ const seen = new Set(); // keys already emitted by the `order` pass
+ function renderVal(v) {
+ if (v === null || v === undefined) return 'null'; // shown as the text "null"
+ if (typeof v === 'string') return esc(v);
+ if (typeof v === 'number' || typeof v === 'boolean') return String(v); // no esc() call
+ return `${esc(JSON.stringify(v, null, 2))}`; // anything else: 2-space JSON, escaped
+ }
+ for (const k of order) { // preferred keys first, in this fixed order
+ if (!(k in env)) continue; // envelope does not carry this key
+ seen.add(k);
+ rows.push(`${esc(k)}${renderVal(env[k])}
`);
+ }
+ for (const k of Object.keys(env)) { // then every remaining key, in Object.keys order
+ if (seen.has(k)) continue; // already rendered by the loop above
+ rows.push(`${esc(k)}${renderVal(env[k])}
`);
+ }
+ return `${rows.join('')}
`;
+}
+
// Parse "[label]\ntext" blocks from a combined description string.
function parseDescSections(desc) {
const sections = [];
@@ -691,7 +726,24 @@
const sections = parseDescSections(data.description || '');
let html = buildSourcesHdr(caps, sections);
for (const [label, text] of sections) {
- html += `${esc(label)}
${esc(text)} `;
+ // VLM multipass returns a JSON envelope. Pretty-print it as a
+ // definition list so the structured fields are readable instead of
+ // arriving as a wall of braces. Plain-prose VLM and other sections
+ // stay in <pre>.
+ let body;
+ const trimmed = text.trim();
+ if (label.toLowerCase() === 'vlm'
+ && trimmed.startsWith('{') && trimmed.endsWith('}')) {
+ try {
+ const env = JSON.parse(trimmed);
+ body = renderVlmEnvelope(env);
+ } catch (_) {
+ body = `${esc(text)}`;
+ }
+ } else {
+ body = `${esc(text)}`;
+ }
+ html += ``;
}
panel.innerHTML = html || 'No description returned.
';
setStatus('READY');