From 16f33a6712add13abbe408da04c7f9aed3510aee Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 03:56:41 +0000 Subject: [PATCH 1/7] Elevate VLM action with four-signal grounding and multipass pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Send the VLM every signal the project already computes on the same step — the screenshot plus the accessibility tree, OCR text, ASCII sketch (with tab-index badges and legend), and focused-element hint — instead of the screenshot alone. Each block is wrapped in its own <tag>...</tag> envelope, independently size-capped, gated by a ground_with_* config flag, and silently omitted on failure so the prompt remains valid screenshot-only. Adds a multipass mode (scene → controls → next-actions, with optional verify pass) that emits a strict JSON envelope with structured fields (summary, app, screen_type, focused, controls, next_actions, modal_open, sensitive_regions, confidence, discrepancies, per-pass timings). Tolerant JSON parsing handles fenced/garbled responses gracefully; a failed pass leaves null fields rather than aborting the call. Recommends Ollama models tuned for a 24 GB 4090 + 128 GB RAM workstation in the config and README: qwen2.5vl:7b primary, qwen2.5vl:3b for the fast pass and per-widget crop labels, with notes on the qwen2.5vl:32b premium and llama3.2-vision:11b verify configurations. The vlm_setup model picker now optionally prompts for model_fast, model_actions, and model_verify when mode=multipass; save_model_to_config takes a key= parameter to persist the auxiliary slots atomically. The web inspector renders the JSON envelope as a definition list. The legacy prompt key is honoured as prompt_single for back-compat. Adds tests/test_description_vlm.py covering JSON tolerance, image downscaling, context-block assembly with each ground_with_* gate, and multipass pass ordering / image reuse / failed-pass tolerance using a mocked urllib opener. 
Extends tests/test_vlm_setup.py for the new key= parameter. https://claude.ai/code/session_01VhYzhCbZ5qvmBThCH8cxLD --- README.md | 68 +++- ascii_renderer.py | 13 +- config.json.example | 35 +- description.py | 607 ++++++++++++++++++++++++++++++---- mcp_server.py | 13 +- tests/test_description_vlm.py | 383 +++++++++++++++++++++ tests/test_vlm_setup.py | 17 + tools.py | 33 +- vlm_setup.py | 52 ++- web_inspector.py | 54 ++- 10 files changed, 1177 insertions(+), 98 deletions(-) create mode 100644 tests/test_description_vlm.py diff --git a/README.md b/README.md index 341bfae..06eb056 100644 --- a/README.md +++ b/README.md @@ -211,7 +211,7 @@ chat-completions endpoint. Two common setups: | Setup | `base_url` | notes | |-------|-----------|-------| | **OpenWebUI** | `http://localhost:3000` | fronts Ollama, Anthropic, OpenAI, etc. | -| **Ollama direct** | `http://localhost:11434` | use a vision model such as `llava` or `llama3.2-vision` | +| **Ollama direct** | `http://localhost:11434` | use a vision model such as `qwen2.5vl:7b`, `llama3.2-vision`, or `minicpm-v` | | **OpenAI / LiteLLM / other** | your endpoint URL | standard `/v1` path | OSScreenObserver automatically probes `/api/v1/models` first (OpenWebUI @@ -219,34 +219,66 @@ convention) and falls back to `/v1/models` (Ollama / OpenAI convention), so pointing `base_url` straight at Ollama works without any extra configuration. -```jsonc -// config.json — OpenWebUI example -"vlm": { - "enabled": true, - "base_url": "http://localhost:3000", // OpenWebUI URL - "api_key": null, // or set $OWUI_API_KEY - "model": null, // null → pick interactively on first launch - "max_tokens": 1500 -} -``` +The VLM channel has two operating modes: + +* **`single`** — one screenshot + one prompt, optionally grounded with the + accessibility tree, OCR text, ASCII sketch, and focused-element hint as + `...` envelopes appended to the prompt. Cheap (one HTTP call) and + back-compatible with prior versions. 
+* **`multipass`** — a three-pass pipeline (scene → controls → next-actions) + with an optional verify pass. Returns a structured JSON envelope with + `summary`, `app`, `screen_type`, `focused`, `controls`, `next_actions`, + `modal_open`, `sensitive_regions`, and per-pass timings. The envelope + travels in the legacy `description` field as pretty-printed JSON and is + also exposed parsed under the new `vlm_structured` field for callers that + prefer not to re-parse. ```jsonc -// config.json — Ollama direct example +// config.json — Ollama direct, recommended starting configuration "vlm": { "enabled": true, - "base_url": "http://localhost:11434", // Ollama's native API port + "base_url": "http://localhost:11434", "api_key": null, - "model": "llama3.2-vision:11b", // any vision-capable model pulled in Ollama - "max_tokens": 1500 + + "model": "qwen2.5vl:7b", // primary (Pass 2 / single-shot) + "model_fast": "qwen2.5vl:3b", // Pass 1 + per-widget crop labels + "model_actions": null, // Pass 3 (no image); falls back to primary + "model_verify": null, // optional second opinion + + "mode": "multipass", // or "single" for legacy one-shot + "output_format": "json", + "max_tokens": 2000, + "temperature": 0.1, + + "ground_with_tree": true, // inject + "ground_with_ocr": true, // inject + "ground_with_sketch": true, // inject with tab badges + "ground_with_focus": true // inject } ``` +**Recommended Ollama models (24 GB RTX 4090, 128 GB RAM):** + +| Role | Model | Tag | ~VRAM | Notes | +|---|---|---|---|---| +| Primary, best overall | Qwen2.5-VL 7B | `qwen2.5vl:7b` | ~8 GB | SOTA small open VLM for UI/document tasks; strong at small fonts. | +| Primary, premium | Qwen2.5-VL 32B (Q4_K_M) | `qwen2.5vl:32b` | ~20 GB | Top-tier reasoning; fits 24 GB at Q4; slower per image. | +| Different family (verify) | Llama 3.2 Vision 11B | `llama3.2-vision:11b` | ~9 GB | Good `model_verify` pair for genuine second opinion. 
| +| OCR-heavy screens | MiniCPM-V 2.6 | `minicpm-v:8b` | ~7 GB | Excellent on dense text and forms. | +| Pass 1 / crop labels | Qwen2.5-VL 3B | `qwen2.5vl:3b` | ~4 GB | Cheap scene tagging; reused for the ASCII renderer's crop labeller. | +| Pass 3 (text-only) | Qwen2.5 14B | `qwen2.5:14b` | ~9 GB | Pass 3 has no image; a text-only LLM is faster than a VLM. | + +Set `OLLAMA_KEEP_ALIVE=30m` and `OLLAMA_MAX_LOADED_MODELS=2` so the primary +and fast models stay resident across multipass calls. + The first time you run `python main.py --mode inspect` with `vlm.enabled=true` and `vlm.model=null`, OSScreenObserver fetches the model list from the endpoint, shows a paginated picker, and saves your -choice back to `config.json`. In `mcp`/`both` mode the picker is -suppressed (stdin is owned by the MCP framing channel); set `vlm.model` -directly in `config.json` for non-interactive use. +choice back to `config.json`. When `mode="multipass"`, the picker also +prompts for the optional `model_fast`, `model_actions`, and `model_verify` +slots (skip any of them to reuse the primary). In `mcp`/`both` mode the +picker is suppressed (stdin is owned by the MCP framing channel); set the +model keys directly in `config.json` for non-interactive use. --- diff --git a/ascii_renderer.py b/ascii_renderer.py index 2bcb6c7..e8489c1 100755 --- a/ascii_renderer.py +++ b/ascii_renderer.py @@ -467,8 +467,15 @@ def _phash(crop: "Image.Image") -> str: def _vlm_describe_crop(crop: "Image.Image", vlm_cfg: dict) -> str: """Single-line natural-language description from an OpenWebUI-compatible - chat-completions endpoint. Returns '' on any failure.""" - if not vlm_cfg or not vlm_cfg.get("enabled") or not vlm_cfg.get("model"): + chat-completions endpoint. Returns '' on any failure. + + Prefers ``vlm.model_fast`` when set (a small/cheap VLM is plenty for + per-widget labelling); falls back to the primary ``vlm.model``. 
+ """ + if not vlm_cfg or not vlm_cfg.get("enabled"): + return "" + model = vlm_cfg.get("model_fast") or vlm_cfg.get("model") + if not model: return "" try: import base64 as _b64 @@ -476,7 +483,7 @@ def _vlm_describe_crop(crop: "Image.Image", vlm_cfg: dict) -> str: crop.save(buf, format="PNG") b64 = _b64.b64encode(buf.getvalue()).decode() payload = { - "model": vlm_cfg["model"], + "model": model, "max_tokens": 60, "messages": [{ "role": "user", diff --git a/config.json.example b/config.json.example index ef145fd..bd6f305 100644 --- a/config.json.example +++ b/config.json.example @@ -27,14 +27,41 @@ "backend": "tesseract" }, - "_vlm": "Vision-LLM modality reached through any OpenAI-compatible chat-completions endpoint. Common values for 'base_url': 'http://localhost:3000' (OpenWebUI), 'http://localhost:11434' (Ollama direct — use a vision model such as llava or llama3.2-vision). OSScreenObserver probes /api/v1/... first (OpenWebUI convention) then falls back to /v1/... (Ollama/OpenAI convention) automatically. Leave 'enabled' false unless you have a local endpoint configured. 'base_url' should NOT include a trailing path component. 'api_key' may be left null when the endpoint accepts $OWUI_API_KEY from the environment. Pick a 'model' interactively via `python main.py --mode inspect`, or set it here directly. 'prompt' is sent verbatim alongside the screenshot; tune it for your downstream agent.", + "_vlm": "Vision-LLM modality reached through any OpenAI-compatible chat-completions endpoint. Common values for 'base_url': 'http://localhost:3000' (OpenWebUI), 'http://localhost:11434' (Ollama direct — use a vision model such as 'qwen2.5vl:7b' for best UI/screen quality, 'llama3.2-vision:11b' for a different family, or 'minicpm-v:8b' for OCR-heavy screens). OSScreenObserver probes /api/v1/... first (OpenWebUI convention) then falls back to /v1/... (Ollama/OpenAI convention) automatically. 'base_url' should NOT include a trailing path component. 
'api_key' may be left null when the endpoint accepts $OWUI_API_KEY from the environment. Pick a 'model' interactively via `python main.py --mode inspect`, or set it here directly. Two operating modes — 'single' sends one screenshot + grounded prompt and returns the raw response; 'multipass' runs a three-pass scene→controls→actions pipeline (plus an optional verify pass) and returns a structured JSON envelope. The ground_with_* flags include the accessibility tree, OCR text, ASCII sketch, and focused-element hint as in-context ground truth alongside the screenshot, gated independently and silently omitted on failure.", "vlm": { "enabled": true, "base_url": "http://localhost:11434", "api_key": null, - "model": "llama3.2-vision:11b", - "max_tokens": 1500, - "prompt": "You are analyzing a computer screen for an agentic AI system. Describe what you see in structured detail: (1) What application(s) are visible? (2) What is the main content or task shown? (3) What UI controls are present and in what state? (4) What is the spatial layout? (5) What is the current focus or active element? (6) What actions would be most natural to take next? Be specific and use exact names of buttons, labels, and fields as they appear." + + "model": "qwen2.5vl:7b", + "model_fast": "qwen2.5vl:3b", + "model_actions": null, + "model_verify": null, + + "max_tokens": 2000, + "temperature": 0.1, + "timeout_s": 60, + + "mode": "multipass", + "output_format": "json", + + "ground_with_tree": true, + "ground_with_ocr": true, + "ground_with_sketch": true, + "ground_with_focus": true, + + "tree_max_lines": 80, + "ocr_max_chars": 4000, + "sketch_max_chars": 6000, + "image_max_dim": 1600, + "focused_zoom_pad": 96, + + "_prompt_legacy": "The legacy 'prompt' key remains honoured as a synonym for 'prompt_single' when mode=='single'. 
When unset, the built-in defaults in description.py are used; tune them per app/agent.", + "prompt_single": null, + "prompt_scene": null, + "prompt_controls": null, + "prompt_actions": null, + "prompt_verify": null }, "_ascii_sketch": "Text-sketch renderer (ascii_renderer.py). 'grid_width'/'grid_height' control the output cell dimensions; the renderer projects the window's screen-pixel bounds into that grid. 'unicode_box' uses ┌─┐│└┘ glyphs (set false for plain ASCII +-|). The fidelity toggles are all independently switchable: 'role_glyphs' enables compact [x]/(•)/▼ control representations; 'occlusion_prune' hides siblings fully covered by later-drawn siblings (modals); 'tab_index_badges' writes ①②③ into focusable elements in DFS focus order; 'landmark_headers' bakes role+name into the top border of toolbars / dialogs / status bars; 'vlm_fallback' (off by default) lets the renderer call the VLM endpoint above to label unidentified custom widgets — set true only when you have vlm.enabled=true and accept the network cost.", diff --git a/description.py b/description.py index 6ce5f82..b581cb3 100755 --- a/description.py +++ b/description.py @@ -18,6 +18,18 @@ requires vlm.base_url + vlm.model in config.json (and an api_key if the endpoint demands one). +The VLM channel has two operating modes: + + single — One screenshot + one prompt + (optionally) the accessibility + tree, OCR text, and ASCII sketch as in-context "ground truth" + blocks. Cheap (one call) and back-compatible. + + multipass — A three-pass pipeline (scene → controls → next-actions) that + returns a strict JSON envelope with structured fields suitable + for an agentic LLM consumer. An optional fourth verify pass can + cross-check the control inventory against the accessibility + tree using a second model. + These can be used individually or combined via combined(). 
""" @@ -26,10 +38,12 @@ import json import logging import os +import re +import time import traceback import urllib.error import urllib.request -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from observer import UIElement, WindowInfo @@ -74,6 +88,144 @@ def _truncate(s: str, n: int) -> str: return s if len(s) <= n else s[: n - 1] + "…" +# ───────────────────────────────────────────────────────────────────────────── +# Default prompts (overridable via config) +# ───────────────────────────────────────────────────────────────────────────── + +_DEFAULT_PROMPT_SINGLE = ( + "You are analysing a computer screen for an agentic AI system.\n" + "\n" + "The screenshot is attached. When , , " + ", or blocks are present below, treat " + "them as GROUND TRUTH for control names, states, and positions — prefer " + "them over your visual guess. Quote on-screen text verbatim from " + " when available.\n" + "\n" + "Return ONLY a JSON object with this schema (use null when a field is " + "not visible — never guess):\n" + "{\n" + ' "summary": "<=2 sentences, what is on screen and what the user ' + 'is doing",\n' + ' "app": "application name, e.g. \\"VS Code\\"",\n' + ' "screen_type": "kebab-case label, e.g. \\"code-editor\\", ' + '\\"settings-dialog\\"",\n' + ' "primary_task": "<=1 sentence",\n' + ' "focused": {"role": "...", "name": "...", "tree_id": "..."} or ' + "null,\n" + ' "controls": [{"role": "...", "name": "...", "state": "...", ' + '"selector_hint": "...", "tree_id": "..."}, ...] (<=8 entries),\n' + ' "next_actions": [{"description": "...", "target_selector": "...", ' + '"rationale": "...", "risk": "low|medium|high"}, ...] (<=3 entries),\n' + ' "modal_open": true | false | null,\n' + ' "sensitive_regions": [{"hint": "...", "bbox": [x, y, w, h]}, ...]\n' + "}\n" + "\n" + "Rules: ≤2 sentences per text field; ≤8 controls; ≤3 candidate actions. " + "Output JSON only — no prose preamble, no code fences." 
+) + +_DEFAULT_PROMPT_SCENE = ( + "Identify the application and screen. The screenshot is attached.\n" + "Return ONLY this JSON (use null for unknowns):\n" + '{"app": "...", "screen_type": "kebab-case", ' + '"primary_task": "<=1 sentence", "language": "BCP-47 or null"}' +) + +_DEFAULT_PROMPT_CONTROLS = ( + "You are inventorying the interactive controls on a computer screen for " + "an agentic AI system. The screenshot is attached.\n" + "\n" + "The , , and blocks below " + "are GROUND TRUTH. Use the tree's role/name/state values verbatim. Use " + "OCR text verbatim for label/value strings. Use the sketch's " + "tab-index badges and legend keys to reference positions stably.\n" + "\n" + "Return ONLY this JSON (use null when not visible — never guess):\n" + "{\n" + ' "focused": {"role": "...", "name": "...", "tree_id": "..."} or ' + "null,\n" + ' "modal_open": true | false,\n' + ' "controls": [\n' + " {\n" + ' "role": "button | menuitem | edit | checkbox | combo | ' + 'tab | link | ...",\n' + ' "name": "verbatim label",\n' + ' "state": "enabled | disabled | checked | unchecked | ' + 'selected | expanded | collapsed",\n' + ' "bbox_hint": [x, y, w, h] in screen pixels (or null),\n' + ' "selector_hint": "XPath-ish or CSS-ish selector",\n' + ' "tree_id": "id from if confident, ' + 'else null"\n' + " }\n" + " ],\n" + ' "sensitive_regions": [{"hint": "...", "bbox": [x, y, w, h]}, ...]\n' + "}\n" + "≤8 controls. Output JSON only." +) + +_DEFAULT_PROMPT_ACTIONS = ( + "Given this scene and control inventory, propose up to 3 reasonable " + "next user actions. Return ONLY this JSON:\n" + '{"next_actions": [{"description": "<=1 sentence", ' + '"target_selector": "selector from controls", ' + '"rationale": "<=1 sentence", ' + '"risk": "low | medium | high"}]}\n' + "Rules:\n" + "- target_selector MUST match a selector_hint or tree_id from the " + "supplied controls.\n" + "- Mark write/click/destructive actions as medium or high risk.\n" + "- Output JSON only." 
+) + +_DEFAULT_PROMPT_VERIFY = ( + "Cross-check this control inventory against the accessibility tree. " + "For each control in , decide whether a matching node exists " + "in . Return ONLY this JSON:\n" + '{"confidence": 0.0_to_1.0, "discrepancies": ' + '[{"control_index": int, "issue": "<=1 sentence"}]}' +) + + +# ───────────────────────────────────────────────────────────────────────────── +# Tolerant JSON parsing for VLM output +# ───────────────────────────────────────────────────────────────────────────── + +_FENCE_RE = re.compile(r"^```(?:json)?\s*|\s*```$", re.IGNORECASE | re.MULTILINE) + + +def _tolerant_json_loads(raw: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + """Best-effort JSON decode of VLM output. Returns (obj, error). + + Strategy: + 1. Strip ```json ... ``` fences and try json.loads. + 2. Fall back to the substring between the first '{' and last '}'. + 3. Return (None, error_message) on second failure. + """ + if raw is None: + return None, "empty response" + text = _FENCE_RE.sub("", raw).strip() + try: + obj = json.loads(text) + if isinstance(obj, dict): + return obj, None + except json.JSONDecodeError as e: + first_err = str(e) + else: + return None, "top-level JSON was not an object" + + # Salvage: clip to first '{' .. last '}'. + lo = text.find("{") + hi = text.rfind("}") + if lo >= 0 and hi > lo: + try: + obj = json.loads(text[lo:hi + 1]) + if isinstance(obj, dict): + return obj, None + except json.JSONDecodeError as e: + return None, f"{first_err}; salvage failed: {e}" + return None, first_err + + # ───────────────────────────────────────────────────────────────────────────── # DescriptionGenerator # ───────────────────────────────────────────────────────────────────────────── @@ -260,60 +412,192 @@ def from_ocr(self, screenshot_bytes: bytes) -> str: f"on_PATH={diag.get('path_discovered')!r}. 
" f"{INSTALL_HINT}]") - # ── VLM (OpenWebUI-compatible chat completions) ────────────────────────── + # ── VLM context-block helpers ──────────────────────────────────────────── + + def _build_context_blocks( + self, + root: Optional[UIElement], + screenshot_bytes: Optional[bytes], + window: Optional[WindowInfo], + ) -> str: + """Assemble the optional , , + , and envelopes that ground the VLM. - def from_vlm(self, screenshot_bytes: bytes) -> Optional[str]: + Each block is gated by its ``ground_with_*`` config flag and silently + omitted on failure or when empty — the prompt remains valid with the + screenshot alone. """ - Generate a rich description via a vision-capable LLM exposed through - an OpenAI-compatible chat-completions endpoint (OpenWebUI or Ollama). + parts: List[str] = [] - Returns the model's response string on success, or ``None`` if the - endpoint is unavailable or not configured — so callers can omit the - VLM section gracefully rather than surfacing an error to the user. - - Required config: - vlm.enabled = true - vlm.base_url = e.g. 
"http://localhost:11434" (Ollama) or - "http://localhost:3000" (OpenWebUI) - vlm.model = a vision-capable model id available at the endpoint - vlm.api_key = optional; falls back to $OWUI_API_KEY + want_tree = bool(self.vlm_cfg.get("ground_with_tree", True)) + want_ocr = bool(self.vlm_cfg.get("ground_with_ocr", True)) + want_sketch = bool(self.vlm_cfg.get("ground_with_sketch", True)) + want_focus = bool(self.vlm_cfg.get("ground_with_focus", True)) + + tree_max_lines = int(self.vlm_cfg.get("tree_max_lines", 80)) + ocr_max_chars = int(self.vlm_cfg.get("ocr_max_chars", 4000)) + sketch_max_chars = int(self.vlm_cfg.get("sketch_max_chars", 6000)) + + # + if want_tree and root is not None: + try: + tree_text = self.from_tree(root, window) + if tree_text and not tree_text.startswith("[Tree description failed"): + lines = tree_text.splitlines() + if len(lines) > tree_max_lines: + lines = lines[:tree_max_lines] + [ + f"… [tree truncated to {tree_max_lines} lines]" + ] + parts.append( + "\n" + + "\n".join(lines) + + "\n" + ) + except Exception as e: + logger.debug("[_build_context_blocks] tree skipped: %s", e) + + # + if (want_ocr and screenshot_bytes is not None + and self.ocr_cfg.get("enabled", True)): + try: + ocr_text = self.from_ocr(screenshot_bytes) + if ocr_text and not ocr_text.startswith("["): + if len(ocr_text) > ocr_max_chars: + ocr_text = ocr_text[:ocr_max_chars] + "… [truncated]" + parts.append( + f"\n{ocr_text}\n" + ) + except Exception as e: + logger.debug("[_build_context_blocks] ocr skipped: %s", e) + + # — lazy import so projects that disable sketch grounding + # don't pay the import cost (ascii_renderer pulls in PIL transitively). 
+ if want_sketch and root is not None: + try: + from ascii_renderer import ASCIIRenderer + renderer = ASCIIRenderer(self.config) + ref = window.bounds if window else root.bounds + result = renderer.render_structured( + root = root, + screen_bounds = ref, + screenshot_bytes = screenshot_bytes, + ) + sketch_text = result.get("sketch") or "" + legend = result.get("legend") or {} + if legend: + legend_lines = ["LEGEND:"] + [ + f" {k}: {v}" for k, v in legend.items() + ] + sketch_text = sketch_text + "\n" + "\n".join(legend_lines) + if sketch_text and not sketch_text.startswith("[ASCII render"): + if len(sketch_text) > sketch_max_chars: + sketch_text = (sketch_text[:sketch_max_chars] + + "… [sketch truncated]") + parts.append( + f"\n{sketch_text}\n" + ) + except Exception as e: + logger.debug("[_build_context_blocks] sketch skipped: %s", e) + + # + if want_focus and root is not None: + try: + focused = _find_focused(root) + if focused is not None: + name = f' "{focused.name}"' if focused.name else "" + elem_id = getattr(focused, "element_id", None) + id_str = f" tree_id={elem_id}" if elem_id else "" + parts.append( + "\n" + f"{focused.role}{name}{id_str}\n" + "" + ) + except Exception as e: + logger.debug("[_build_context_blocks] focus skipped: %s", e) + + return "\n\n".join(parts) + + # ── VLM HTTP helpers ────────────────────────────────────────────────────── + + @staticmethod + def _prepare_image(screenshot_bytes: bytes, max_dim: int) -> bytes: + """Downscale PNG to *max_dim* on the long edge, preserving aspect. + + Returns the original bytes when Pillow is unavailable, the image is + already small enough, or any error occurs — never raises. 
""" - if not self.vlm_cfg.get("enabled", False): - return None + if not screenshot_bytes or max_dim <= 0: + return screenshot_bytes + try: + from PIL import Image + img = Image.open(io.BytesIO(screenshot_bytes)) + w, h = img.size + long_edge = max(w, h) + if long_edge <= max_dim: + return screenshot_bytes + scale = max_dim / long_edge + new_size = (max(1, int(w * scale)), max(1, int(h * scale))) + resized = img.resize(new_size, Image.LANCZOS) + buf = io.BytesIO() + resized.save(buf, format="PNG", optimize=True) + return buf.getvalue() + except Exception as e: + logger.debug("[_prepare_image] passthrough (%s)", e) + return screenshot_bytes - model = self.vlm_cfg.get("model") - if not model: - logger.debug("[from_vlm] vlm.model not configured — skipping VLM") + def _post_vlm( + self, + prompt: str, + screenshot_bytes: Optional[bytes], + *, + model: Optional[str] = None, + max_tokens: Optional[int] = None, + temperature: Optional[float] = None, + timeout_s: Optional[float] = None, + ) -> Optional[str]: + """Single chat-completions request. Returns assistant text or None. + + Centralises auth, redirect-refusal, prefix-fallback, and image + preparation so every pass shares one implementation. 
+ """ + chosen_model = model or self.vlm_cfg.get("model") + if not chosen_model: + logger.debug("[_post_vlm] no model configured") return None - base_url = self.vlm_cfg.get("base_url") or "http://localhost:3000" - api_key = (self.vlm_cfg.get("api_key") - or os.environ.get("OWUI_API_KEY", "")) - prompt_txt = self.vlm_cfg.get( - "prompt", - "Describe what is on this computer screen in structured detail " - "for an AI agent.", - ) - max_tokens = self.vlm_cfg.get("max_tokens", 1500) - - b64_img = base64.b64encode(screenshot_bytes).decode() - payload = { - "model": model, - "max_tokens": max_tokens, - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", - "image_url": {"url": f"data:image/png;base64,{b64_img}"}}, - {"type": "text", "text": prompt_txt}, - ], - }], + base_url = self.vlm_cfg.get("base_url") or "http://localhost:3000" + api_key = (self.vlm_cfg.get("api_key") + or os.environ.get("OWUI_API_KEY", "")) + max_tok = max_tokens if max_tokens is not None else \ + self.vlm_cfg.get("max_tokens", 1500) + temp = (temperature if temperature is not None + else self.vlm_cfg.get("temperature", 0.1)) + timeout = float(timeout_s if timeout_s is not None + else self.vlm_cfg.get("timeout_s", 240)) + + content: List[Dict[str, Any]] = [] + if screenshot_bytes is not None: + img_max = int(self.vlm_cfg.get("image_max_dim", 1600)) + prepared = self._prepare_image(screenshot_bytes, img_max) + b64_img = base64.b64encode(prepared).decode() + content.append({ + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{b64_img}"}, + }) + content.append({"type": "text", "text": prompt}) + + payload: Dict[str, Any] = { + "model": chosen_model, + "max_tokens": max_tok, + "messages": [{"role": "user", "content": content}], } + if temp is not None: + payload["temperature"] = float(temp) + headers = {"Content-Type": "application/json"} if api_key: headers["Authorization"] = f"Bearer {api_key}" - # Try /api/v1 (OpenWebUI) first, fall back to /v1 (Ollama / OpenAI). 
_PREFIXES = ["/api/v1", "/v1"] opener = urllib.request.build_opener(_NoRedirectHandler) last_exc: Optional[Exception] = None @@ -324,30 +608,205 @@ def from_vlm(self, screenshot_bytes: bytes) -> Optional[str]: url, data=json.dumps(payload).encode("utf-8"), headers=headers, method="POST", ) - # urllib follows redirects by default and would silently - # convert a 302/303 POST into a GET, dropping the screenshot - # payload (and potentially forwarding it to an unintended - # host). Refuse any redirect so misconfigured vlm.base_url - # fails loudly instead. - with opener.open(req, timeout=240) as resp: + with opener.open(req, timeout=timeout) as resp: data = json.loads(resp.read().decode("utf-8")) return data["choices"][0]["message"]["content"] except urllib.error.HTTPError as e: - # 404 on the URL path suggests wrong prefix — try the next. - # Any other HTTP error is definitive; fall through to warning. + last_exc = e if e.code == 404: - last_exc = e continue - last_exc = e break except Exception as e: last_exc = e - # Connection-level failure (refused, timeout, DNS) means the - # endpoint is simply not reachable; no point trying more. break - logger.warning("[from_vlm] VLM unavailable (%s) — skipping", last_exc) + logger.warning("[_post_vlm] VLM unavailable (%s) — skipping", last_exc) return None + # ── VLM single-shot (grounded) ──────────────────────────────────────────── + + def from_vlm( + self, + screenshot_bytes: bytes, + *, + root: Optional[UIElement] = None, + window: Optional[WindowInfo] = None, + ) -> Optional[str]: + """Single-shot VLM call, optionally grounded with tree/OCR/sketch. + + Backwards-compatible: when ``root`` is None, this behaves like the + original implementation (screenshot + prompt only). When ``root`` is + supplied and the corresponding ``ground_with_*`` flags are true, the + accessibility tree, OCR text, ASCII sketch, and focused element are + appended as ``...`` envelopes after the user prompt. 
+ + Returns the model's response string on success, or ``None`` if the + endpoint is unavailable or not configured. + """ + if not self.vlm_cfg.get("enabled", False): + return None + if not self.vlm_cfg.get("model"): + logger.debug("[from_vlm] vlm.model not configured — skipping VLM") + return None + + # Legacy "prompt" key remains honoured as a synonym for "prompt_single". + prompt_txt = ( + self.vlm_cfg.get("prompt_single") + or self.vlm_cfg.get("prompt") + or _DEFAULT_PROMPT_SINGLE + ) + + ctx = self._build_context_blocks(root, screenshot_bytes, window) + full_prompt = f"{prompt_txt}\n\n{ctx}" if ctx else prompt_txt + + return self._post_vlm(full_prompt, screenshot_bytes) + + # ── VLM multi-pass ──────────────────────────────────────────────────────── + + def from_vlm_multipass( + self, + screenshot_bytes: bytes, + *, + root: Optional[UIElement] = None, + window: Optional[WindowInfo] = None, + ) -> Optional[Dict[str, Any]]: + """Three-pass VLM pipeline returning a structured JSON envelope. + + Pass 1 (scene) — small model, screenshot only. + Pass 2 (controls) — primary model, screenshot + grounding blocks. + Pass 3 (actions) — text-only, no image; uses pass 1+2 results. + Pass V (verify) — optional second model, no image; cross-checks + pass-2 controls against the accessibility tree. + + Each pass is independently fault-tolerant: a failed pass yields null + fields in the envelope rather than aborting the call. Returns None + only when VLM is disabled or no model is configured. 
+ """ + if not self.vlm_cfg.get("enabled", False): + return None + primary = self.vlm_cfg.get("model") + if not primary: + logger.debug("[from_vlm_multipass] vlm.model not configured") + return None + + fast = self.vlm_cfg.get("model_fast") or primary + actions = self.vlm_cfg.get("model_actions") or primary + verify = self.vlm_cfg.get("model_verify") # optional + + env: Dict[str, Any] = { + "summary": None, + "app": None, + "screen_type": None, + "primary_task": None, + "focused": None, + "controls": [], + "next_actions": [], + "modal_open": None, + "sensitive_regions": [], + "confidence": None, + "discrepancies": [], + "_passes": {}, + } + + # ── Pass 1: scene ──────────────────────────────────────────────────── + t0 = time.time() + prompt1 = self.vlm_cfg.get("prompt_scene") or _DEFAULT_PROMPT_SCENE + if window is not None: + prompt1 += (f"\n\nWindow title: {window.title!r}\n" + f"Process: {window.process_name!r}") + raw1 = self._post_vlm(prompt1, screenshot_bytes, model=fast, + max_tokens=400) + env["_passes"]["scene_ms"] = int((time.time() - t0) * 1000) + scene_obj: Dict[str, Any] = {} + if raw1: + scene_obj, err = _tolerant_json_loads(raw1) + if scene_obj is None: + scene_obj = {} + env["_passes"]["scene_error"] = err + else: + for k in ("app", "screen_type", "primary_task"): + if scene_obj.get(k) is not None: + env[k] = scene_obj[k] + + # ── Pass 2: controls (grounded) ────────────────────────────────────── + t0 = time.time() + prompt2 = self.vlm_cfg.get("prompt_controls") or _DEFAULT_PROMPT_CONTROLS + ctx = self._build_context_blocks(root, screenshot_bytes, window) + full2 = f"{prompt2}\n\n{ctx}" if ctx else prompt2 + raw2 = self._post_vlm(full2, screenshot_bytes, model=primary) + env["_passes"]["controls_ms"] = int((time.time() - t0) * 1000) + controls_obj: Dict[str, Any] = {} + if raw2: + controls_obj, err = _tolerant_json_loads(raw2) + if controls_obj is None: + controls_obj = {} + env["_passes"]["controls_error"] = err + else: + for k in ("focused", 
"modal_open", "controls", + "sensitive_regions"): + if controls_obj.get(k) is not None: + env[k] = controls_obj[k] + + # ── Pass 3: next-action candidates (no image) ──────────────────────── + t0 = time.time() + prompt3 = self.vlm_cfg.get("prompt_actions") or _DEFAULT_PROMPT_ACTIONS + ctx3 = ( + f"\n{json.dumps(scene_obj, ensure_ascii=False)}\n\n\n" + f"\n{json.dumps(env.get('controls') or [], ensure_ascii=False)}\n" + ) + full3 = f"{prompt3}\n\n{ctx3}" + raw3 = self._post_vlm(full3, None, model=actions, max_tokens=600) + env["_passes"]["actions_ms"] = int((time.time() - t0) * 1000) + if raw3: + actions_obj, err = _tolerant_json_loads(raw3) + if actions_obj is None: + env["_passes"]["actions_error"] = err + elif isinstance(actions_obj.get("next_actions"), list): + env["next_actions"] = actions_obj["next_actions"] + + # ── Pass V: verify (optional) ──────────────────────────────────────── + if verify and root is not None: + t0 = time.time() + promptv = self.vlm_cfg.get("prompt_verify") or _DEFAULT_PROMPT_VERIFY + tree_text = "" + try: + tree_text = self.from_tree(root, window) + except Exception: + pass + ctxv = ( + f"\n" + f"{json.dumps(env.get('controls') or [], ensure_ascii=False)}\n" + f"\n\n" + f"\n{tree_text}\n" + ) + rawv = self._post_vlm(f"{promptv}\n\n{ctxv}", None, + model=verify, max_tokens=400) + env["_passes"]["verify_ms"] = int((time.time() - t0) * 1000) + if rawv: + verify_obj, err = _tolerant_json_loads(rawv) + if verify_obj is None: + env["_passes"]["verify_error"] = err + else: + if verify_obj.get("confidence") is not None: + env["confidence"] = verify_obj["confidence"] + if isinstance(verify_obj.get("discrepancies"), list): + env["discrepancies"] = verify_obj["discrepancies"] + else: + env["_passes"]["verify_ms"] = 0 + + # ── Synthesise a human summary if pass 2 didn't supply one ────────── + if env.get("summary") is None: + bits: List[str] = [] + if env.get("app"): + bits.append(env["app"]) + if env.get("screen_type"): + 
bits.append(f"({env['screen_type']})") + if env.get("primary_task"): + bits.append(f"— {env['primary_task']}") + if bits: + env["summary"] = " ".join(bits) + + return env + # ── Combined ────────────────────────────────────────────────────────────── def combined( @@ -355,16 +814,40 @@ def combined( root: UIElement, screenshot_bytes: Optional[bytes], window: Optional[WindowInfo] = None, - ) -> Dict[str, str]: - """Return all enabled descriptions in a keyed dict.""" - result: Dict[str, str] = { + ) -> Dict[str, Any]: + """Return all enabled descriptions in a keyed dict. + + Keys: + - accessibility : prose serialisation of the element tree (always). + - ocr : Tesseract output (when ocr.enabled and screenshot + present). + - vlm : string form of the VLM output. In single mode this + is the raw response. In multipass mode this is the + JSON envelope serialised with json.dumps(indent=2). + - vlm_structured: when multipass mode produced an envelope, the + envelope is also exposed as a nested dict here so + structured consumers don't have to re-parse. 
+ """ + result: Dict[str, Any] = { "accessibility": self.from_tree(root, window) } if screenshot_bytes: if self.ocr_cfg.get("enabled", True): result["ocr"] = self.from_ocr(screenshot_bytes) if self.vlm_cfg.get("enabled", False): - vlm_out = self.from_vlm(screenshot_bytes) - if vlm_out is not None: - result["vlm"] = vlm_out + mode = (self.vlm_cfg.get("mode") or "single").lower() + if mode == "multipass": + env = self.from_vlm_multipass( + screenshot_bytes, root=root, window=window, + ) + if env is not None: + result["vlm"] = json.dumps(env, indent=2, + ensure_ascii=False) + result["vlm_structured"] = env + else: + vlm_out = self.from_vlm( + screenshot_bytes, root=root, window=window, + ) + if vlm_out is not None: + result["vlm"] = vlm_out return result diff --git a/mcp_server.py b/mcp_server.py index 106024d..f528e45 100755 --- a/mcp_server.py +++ b/mcp_server.py @@ -1057,7 +1057,18 @@ def _t_description(self, hwnd, info, args) -> Dict: elif mode == "vlm": if shot is None: return {"error": "Screenshot unavailable for VLM"} - vlm_out = self.describer.from_vlm(shot) + vlm_mode = (self.describer.vlm_cfg.get("mode") or "single").lower() + if vlm_mode == "multipass": + env = self.describer.from_vlm_multipass( + shot, root=tree, window=info, + ) + if env is None: + return {"mode": mode, + "description": "[VLM unavailable — check vlm.base_url and vlm.model in config.json]"} + return {"mode": mode, + "description": json.dumps(env, indent=2, ensure_ascii=False), + "vlm_structured": env} + vlm_out = self.describer.from_vlm(shot, root=tree, window=info) if vlm_out is None: return {"mode": mode, "description": "[VLM unavailable — check vlm.base_url and vlm.model in config.json]"} return {"mode": mode, "description": vlm_out} diff --git a/tests/test_description_vlm.py b/tests/test_description_vlm.py new file mode 100644 index 0000000..2400daf --- /dev/null +++ b/tests/test_description_vlm.py @@ -0,0 +1,383 @@ +"""Tests for the VLM action in description.py. 
+ +Covers the pieces that don't require a live Ollama endpoint: + + * _tolerant_json_loads — fenced/garbage/partial input. + * _prepare_image — downscale math and the no-op fast paths. + * _build_context_blocks — block assembly and the ``ground_with_*`` gates. + * from_vlm — grounded single-shot, with urllib.request mocked. + * from_vlm_multipass — pass ordering, image reuse on Pass 3, and + graceful pass-failure handling. +""" +from __future__ import annotations + +import io +import json +from typing import Any, Dict, List +from unittest.mock import patch, MagicMock + +import pytest + +from description import ( + DescriptionGenerator, + _tolerant_json_loads, +) +from observer import Bounds, UIElement + + +# ─── Fixtures ──────────────────────────────────────────────────────────────── + +def _elem(role, name="", focused=False, children=None, **kw) -> UIElement: + return UIElement( + element_id=kw.get("element_id", f"id-{role}"), + name=name, role=role, + value=kw.get("value"), + bounds=kw.get("bounds", Bounds(0, 0, 100, 100)), + enabled=kw.get("enabled", True), + focused=focused, + keyboard_shortcut=kw.get("keyboard_shortcut"), + description=kw.get("description"), + children=children or [], + ) + + +def _png_bytes(w=200, h=100) -> bytes: + """Generate a real PNG so Pillow operations work.""" + from PIL import Image + img = Image.new("RGB", (w, h), color=(120, 200, 80)) + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + +@pytest.fixture +def cfg_vlm() -> dict: + return { + "vlm": { + "enabled": True, + "base_url": "http://localhost:11434", + "api_key": None, + "model": "qwen2.5vl:7b", + "model_fast": "qwen2.5vl:3b", + "max_tokens": 800, + "temperature": 0.1, + "timeout_s": 30, + "mode": "multipass", + "ground_with_tree": True, + "ground_with_ocr": False, + "ground_with_sketch": False, + "ground_with_focus": True, + "tree_max_lines": 20, + }, + "ocr": {"enabled": False}, + } + + +# ─── _tolerant_json_loads 
──────────────────────────────────────────────────── + +def test_tolerant_json_loads_plain(): + obj, err = _tolerant_json_loads('{"app": "VS Code", "controls": []}') + assert err is None + assert obj == {"app": "VS Code", "controls": []} + + +def test_tolerant_json_loads_strips_json_fence(): + raw = '```json\n{"app": "VS Code"}\n```' + obj, err = _tolerant_json_loads(raw) + assert err is None and obj == {"app": "VS Code"} + + +def test_tolerant_json_loads_strips_bare_fence(): + raw = '```\n{"a": 1}\n```' + obj, err = _tolerant_json_loads(raw) + assert err is None and obj == {"a": 1} + + +def test_tolerant_json_loads_salvages_from_prose(): + raw = "Here is the JSON you asked for:\n{\"app\": \"Slack\"}\nthanks!" + obj, err = _tolerant_json_loads(raw) + assert err is None and obj == {"app": "Slack"} + + +def test_tolerant_json_loads_rejects_non_object(): + obj, err = _tolerant_json_loads("[1, 2, 3]") + assert obj is None + assert err and "not an object" in err.lower() + + +def test_tolerant_json_loads_garbage_returns_error(): + obj, err = _tolerant_json_loads("definitely not json") + assert obj is None + assert err # non-empty error message + + +def test_tolerant_json_loads_empty(): + obj, err = _tolerant_json_loads("") + assert obj is None and err + + +def test_tolerant_json_loads_none(): + obj, err = _tolerant_json_loads(None) # type: ignore[arg-type] + assert obj is None and err + + +# ─── _prepare_image ────────────────────────────────────────────────────────── + +def test_prepare_image_passes_through_when_small(): + src = _png_bytes(800, 600) + out = DescriptionGenerator._prepare_image(src, max_dim=1600) + assert out is src # no-op fast path returns the same object + + +def test_prepare_image_downscales_long_edge_to_max_dim(): + src = _png_bytes(3200, 1600) + out = DescriptionGenerator._prepare_image(src, max_dim=1600) + assert out is not src + from PIL import Image + img = Image.open(io.BytesIO(out)) + assert max(img.size) == 1600 + # Aspect ratio preserved (2:1 
→ 1600x800). + assert img.size == (1600, 800) + + +def test_prepare_image_zero_max_dim_passes_through(): + src = _png_bytes(3200, 1600) + out = DescriptionGenerator._prepare_image(src, max_dim=0) + assert out is src + + +def test_prepare_image_empty_input(): + out = DescriptionGenerator._prepare_image(b"", max_dim=1600) + assert out == b"" + + +# ─── _build_context_blocks ─────────────────────────────────────────────────── + +def test_build_context_blocks_emits_tree_and_focus(cfg_vlm): + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", "Editor", children=[ + _elem("Button", "OK", focused=True), + _elem("Button", "Cancel"), + ]) + out = gen._build_context_blocks(root, None, None) + assert "" in out + assert "" in out + assert "" in out + assert "OK" in out # focused element name surfaces + + +def test_build_context_blocks_omits_blocks_when_flags_off(cfg_vlm): + cfg_vlm["vlm"]["ground_with_tree"] = False + cfg_vlm["vlm"]["ground_with_focus"] = False + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", focused=True) + out = gen._build_context_blocks(root, None, None) + assert "" not in out + assert "" not in out + + +def test_build_context_blocks_truncates_tree(cfg_vlm): + cfg_vlm["vlm"]["tree_max_lines"] = 3 + gen = DescriptionGenerator(cfg_vlm) + # Build a long tree. 
+ kids = [_elem("Button", f"B{i}") for i in range(40)] + root = _elem("Window", "many", children=kids) + out = gen._build_context_blocks(root, None, None) + assert "tree truncated" in out + + +def test_build_context_blocks_handles_no_root(cfg_vlm): + gen = DescriptionGenerator(cfg_vlm) + out = gen._build_context_blocks(None, None, None) + assert out == "" + + +# ─── _post_vlm transport (mocked HTTP) ─────────────────────────────────────── + +class _FakeResp: + def __init__(self, body: bytes): + self._body = body + def __enter__(self): return self + def __exit__(self, *_): return False + def read(self): return self._body + + +def _mk_resp(content: str) -> _FakeResp: + return _FakeResp(json.dumps( + {"choices": [{"message": {"content": content}}]} + ).encode("utf-8")) + + +def test_post_vlm_returns_assistant_text(cfg_vlm): + gen = DescriptionGenerator(cfg_vlm) + captured: Dict[str, Any] = {} + def fake_open(req, timeout=0): + captured["url"] = req.full_url + captured["body"] = json.loads(req.data.decode("utf-8")) + return _mk_resp("hello") + fake_opener = MagicMock() + fake_opener.open.side_effect = fake_open + with patch("description.urllib.request.build_opener", return_value=fake_opener): + out = gen._post_vlm("hi", _png_bytes()) + assert out == "hello" + assert captured["body"]["model"] == "qwen2.5vl:7b" + assert captured["body"]["temperature"] == 0.1 + # Image was attached. 
+ contents = captured["body"]["messages"][0]["content"] + assert any(c.get("type") == "image_url" for c in contents) + + +def test_post_vlm_returns_none_when_model_unset(cfg_vlm): + cfg_vlm["vlm"]["model"] = None + gen = DescriptionGenerator(cfg_vlm) + assert gen._post_vlm("hi", None) is None + + +# ─── from_vlm (single-shot, grounded) ──────────────────────────────────────── + +def test_from_vlm_single_attaches_grounding(cfg_vlm): + cfg_vlm["vlm"]["mode"] = "single" + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", "Editor", children=[_elem("Button", "OK")]) + + captured: Dict[str, Any] = {} + def fake_open(req, timeout=0): + captured["body"] = json.loads(req.data.decode("utf-8")) + return _mk_resp("structured output") + fake_opener = MagicMock() + fake_opener.open.side_effect = fake_open + with patch("description.urllib.request.build_opener", return_value=fake_opener): + out = gen.from_vlm(_png_bytes(), root=root) + assert out == "structured output" + text_block = next(c for c in captured["body"]["messages"][0]["content"] + if c.get("type") == "text")["text"] + assert "" in text_block + + +def test_from_vlm_disabled_returns_none(cfg_vlm): + cfg_vlm["vlm"]["enabled"] = False + gen = DescriptionGenerator(cfg_vlm) + assert gen.from_vlm(_png_bytes()) is None + + +# ─── from_vlm_multipass — pass ordering and image reuse ────────────────────── + +def test_multipass_runs_three_passes_in_order(cfg_vlm): + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", "App", children=[_elem("Button", "OK", focused=True)]) + + calls: List[Dict[str, Any]] = [] + def fake_open(req, timeout=0): + body = json.loads(req.data.decode("utf-8")) + contents = body["messages"][0]["content"] + has_image = any(c.get("type") == "image_url" for c in contents) + text = next(c for c in contents if c.get("type") == "text")["text"] + calls.append({"model": body["model"], "has_image": has_image, + "text_head": text[:80]}) + n = len(calls) + # Pass 1 (scene), Pass 2 (controls), Pass 
3 (actions). + if n == 1: + content = '{"app": "VS Code", "screen_type": "code-editor", ' \ + '"primary_task": "Editing"}' + elif n == 2: + content = '{"focused": {"role": "button", "name": "OK"}, ' \ + '"modal_open": false, "controls": ' \ + '[{"role": "button", "name": "OK"}]}' + else: + content = '{"next_actions": [{"description": "Click OK", ' \ + '"target_selector": "//button[@name=\'OK\']", ' \ + '"rationale": "primary action", "risk": "low"}]}' + return _mk_resp(content) + + fake_opener = MagicMock() + fake_opener.open.side_effect = fake_open + with patch("description.urllib.request.build_opener", return_value=fake_opener): + env = gen.from_vlm_multipass(_png_bytes(), root=root) + + assert env is not None + # Three passes, in order. + assert len(calls) == 3 + assert calls[0]["model"] == "qwen2.5vl:3b" # fast for Pass 1 + assert calls[1]["model"] == "qwen2.5vl:7b" # primary for Pass 2 + assert calls[2]["model"] == "qwen2.5vl:7b" # fallback for Pass 3 + # Image attached for Pass 1 + 2 only (Pass 3 is text-only). + assert [c["has_image"] for c in calls] == [True, True, False] + + # Envelope merges fields from all three passes. + assert env["app"] == "VS Code" + assert env["screen_type"] == "code-editor" + assert env["focused"] == {"role": "button", "name": "OK"} + assert env["modal_open"] is False + assert len(env["controls"]) == 1 + assert len(env["next_actions"]) == 1 + # Timing markers populated. + assert env["_passes"]["scene_ms"] >= 0 + assert env["_passes"]["controls_ms"] >= 0 + assert env["_passes"]["actions_ms"] >= 0 + + +def test_multipass_tolerates_failed_pass(cfg_vlm): + """A garbled response from one pass leaves null fields, never aborts.""" + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", "App", children=[_elem("Button", "OK")]) + + call_n = {"n": 0} + def fake_open(req, timeout=0): + call_n["n"] += 1 + # Pass 1 returns garbage; later passes return valid JSON. 
+ if call_n["n"] == 1: + return _mk_resp("not json at all, sorry") + if call_n["n"] == 2: + return _mk_resp('{"controls": [{"role": "button"}]}') + return _mk_resp('{"next_actions": []}') + + fake_opener = MagicMock() + fake_opener.open.side_effect = fake_open + with patch("description.urllib.request.build_opener", return_value=fake_opener): + env = gen.from_vlm_multipass(_png_bytes(), root=root) + assert env is not None + # Scene pass failed → fields stay None, but envelope is still returned + # and the later passes still landed their fields. + assert env["app"] is None + assert env["_passes"].get("scene_error") + assert env["controls"] == [{"role": "button"}] + + +def test_multipass_returns_none_when_disabled(cfg_vlm): + cfg_vlm["vlm"]["enabled"] = False + gen = DescriptionGenerator(cfg_vlm) + assert gen.from_vlm_multipass(_png_bytes()) is None + + +# ─── combined() routes to single vs multipass ──────────────────────────────── + +def test_combined_multipass_exposes_structured(cfg_vlm): + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", "App", children=[_elem("Button", "OK")]) + + def fake_open(req, timeout=0): + return _mk_resp('{"app": "VS Code"}') + fake_opener = MagicMock() + fake_opener.open.side_effect = fake_open + with patch("description.urllib.request.build_opener", return_value=fake_opener): + out = gen.combined(root, _png_bytes()) + assert "vlm" in out + assert "vlm_structured" in out + assert isinstance(out["vlm_structured"], dict) + # The string form is JSON-parseable. 
+ json.loads(out["vlm"]) + + +def test_combined_single_mode_no_structured(cfg_vlm): + cfg_vlm["vlm"]["mode"] = "single" + gen = DescriptionGenerator(cfg_vlm) + root = _elem("Window", "App") + + def fake_open(req, timeout=0): + return _mk_resp("plain prose response") + fake_opener = MagicMock() + fake_opener.open.side_effect = fake_open + with patch("description.urllib.request.build_opener", return_value=fake_opener): + out = gen.combined(root, _png_bytes()) + assert out.get("vlm") == "plain prose response" + assert "vlm_structured" not in out diff --git a/tests/test_vlm_setup.py b/tests/test_vlm_setup.py index ba08ab6..9a61b03 100644 --- a/tests/test_vlm_setup.py +++ b/tests/test_vlm_setup.py @@ -72,3 +72,20 @@ def test_save_model_creates_vlm_section_if_missing(tmp_path): on_disk = json.load(f) assert on_disk["vlm"] == {"model": "some/model"} assert on_disk["web_ui"]["host"] == "127.0.0.1" + + +def test_save_model_writes_alternate_slot(tmp_path): + """The optional ``key=`` parameter persists multipass auxiliary models + (model_fast, model_actions, model_verify) without clobbering the + primary ``model`` slot.""" + cfg, path = _cfg(tmp_path, enabled=True, model="qwen2.5vl:7b") + vlm_setup.save_model_to_config(path, "qwen2.5vl:3b", key="model_fast") + vlm_setup.save_model_to_config(path, "llama3.2-vision:11b", + key="model_verify") + with open(path, encoding="utf-8") as f: + on_disk = json.load(f) + assert on_disk["vlm"]["model"] == "qwen2.5vl:7b" + assert on_disk["vlm"]["model_fast"] == "qwen2.5vl:3b" + assert on_disk["vlm"]["model_verify"] == "llama3.2-vision:11b" + # UTF-8 prompt round-trips alongside the new keys. 
+ assert on_disk["vlm"]["prompt"] == "Describe — naïvely 😀" diff --git a/tools.py b/tools.py index 009e24c..27ef2e1 100644 --- a/tools.py +++ b/tools.py @@ -1421,15 +1421,35 @@ def get_screen_description(ctx: ToolContext, args: Dict[str, Any]) -> Dict[str, except Exception as e: logger.exception("[get_screen_description] OCR failed: %s", e) - # VLM — attempted when enabled in config. + # VLM — attempted when enabled in config. In multipass mode the VLM + # output is a structured envelope; the JSON-serialised form is folded + # into the concatenated body (for back-compat with the legacy text + # description) and the parsed dict is returned separately under + # ``vlm_structured`` so callers don't have to re-parse it. + vlm_structured: Any = None vlm_enabled = (ctx.config.get("vlm", {}) or {}).get("enabled", False) if vlm_enabled: try: shot = ctx.observer.get_screenshot(info.handle) if shot: - vlm_out = ctx.describer.from_vlm(shot) - if vlm_out is not None: - parts["vlm"] = vlm_out + vlm_mode = ( + (ctx.config.get("vlm", {}) or {}).get("mode") or "single" + ).lower() + if vlm_mode == "multipass": + env = ctx.describer.from_vlm_multipass( + shot, root=sub, window=info, + ) + if env is not None: + import json as _json + parts["vlm"] = _json.dumps(env, indent=2, + ensure_ascii=False) + vlm_structured = env + else: + vlm_out = ctx.describer.from_vlm( + shot, root=sub, window=info, + ) + if vlm_out is not None: + parts["vlm"] = vlm_out else: logger.warning("[get_screen_description] screenshot unavailable for VLM") except Exception as e: @@ -1448,7 +1468,7 @@ def get_screen_description(ctx: ToolContext, args: Dict[str, Any]) -> Dict[str, body = body[:char_cap] + "… [truncated]" truncated = True - return { + result: Dict[str, Any] = { "ok": True, "success": True, "step_id": step_id, "caused_by_step_id": caused_by, "window": info.title, "window_uid": info.window_uid, @@ -1456,6 +1476,9 @@ def get_screen_description(ctx: ToolContext, args: Dict[str, Any]) -> Dict[str, 
"description": body, "truncated": truncated, } + if vlm_structured is not None: + result["vlm_structured"] = vlm_structured + return result # ─── Dispatcher ─────────────────────────────────────────────────────────────── diff --git a/vlm_setup.py b/vlm_setup.py index 41ddcf9..0c26b91 100644 --- a/vlm_setup.py +++ b/vlm_setup.py @@ -128,18 +128,23 @@ def pick_model_paginated(models: List[str]) -> Optional[str]: return raw -def save_model_to_config(config_path: str, model: str) -> None: - """Persist vlm.model back to *config_path*, preserving all other keys. +def save_model_to_config(config_path: str, model: str, + *, key: str = "model") -> None: + """Persist vlm. back to *config_path*, preserving all other keys. Writes via a sibling temp file + atomic rename so an interrupted run (Ctrl-C, OOM, full disk on the final flush) cannot leave the user with a truncated config.json. Encoding is pinned to UTF-8 so the custom `vlm.prompt` survives a round-trip on platforms with a non-UTF-8 default locale (Windows in particular). + + *key* selects which slot to write — "model" (default, primary), + "model_fast", "model_actions", or "model_verify" for the multipass + pipeline auxiliary models. """ with open(config_path, encoding="utf-8") as f: cfg = json.load(f) - cfg.setdefault("vlm", {})["model"] = model + cfg.setdefault("vlm", {})[key] = model dir_name = os.path.dirname(os.path.abspath(config_path)) or "." 
fd, tmp = tempfile.mkstemp( prefix=".config.", suffix=".json.tmp", dir=dir_name, @@ -193,6 +198,8 @@ def ensure_vlm_model(config: dict, config_path: str, *, file=sys.stderr) vlm["enabled"] = False return + print("\n[vlm] Pick the PRIMARY model (used for Pass 2 / single-shot).", + file=sys.stderr) chosen = pick_model_paginated(models) if not chosen: print("[vlm] No model chosen — VLM disabled for this run.", @@ -201,9 +208,46 @@ def ensure_vlm_model(config: dict, config_path: str, *, return vlm["model"] = chosen try: - save_model_to_config(config_path, chosen) + save_model_to_config(config_path, chosen, key="model") print(f"[vlm] Saved vlm.model = {chosen!r} to {config_path}", file=sys.stderr) except Exception as e: print(f"[vlm] (Could not write {config_path}: {e}; using for this " f"run only.)", file=sys.stderr) + + # Multipass auxiliaries are optional. Only prompt when the run is + # actually configured for multipass and the slot is still empty — a + # pre-set value from config.json is honoured without re-asking. + if (vlm.get("mode") or "single").lower() != "multipass": + return + + for slot, label, help_text in ( + ("model_fast", "FAST", + "Used for Pass 1 (scene) and per-widget crop labelling. A small " + "model (e.g. qwen2.5vl:3b or moondream) is plenty. Skip to reuse " + "the primary model."), + ("model_actions", "ACTIONS", + "Used for Pass 3 (next-action candidates). No image is sent on " + "this pass, so a strong text-only LLM (e.g. qwen2.5:14b) is " + "cheaper than a VLM. Skip to reuse the primary model."), + ("model_verify", "VERIFY", + "OPTIONAL. Used for the verify pass that cross-checks pass-2 " + "controls against the accessibility tree. Pick a different model " + "family from the primary for a genuine second opinion. Skip to " + "leave the verify pass disabled."), + ): + if vlm.get(slot): + continue + print(f"\n[vlm] Pick the {label} model (optional). 
{help_text}", + file=sys.stderr) + picked = pick_model_paginated(models) + if not picked: + continue + vlm[slot] = picked + try: + save_model_to_config(config_path, picked, key=slot) + print(f"[vlm] Saved vlm.{slot} = {picked!r} to {config_path}", + file=sys.stderr) + except Exception as e: + print(f"[vlm] (Could not write {config_path}: {e}; using for " + f"this run only.)", file=sys.stderr) diff --git a/web_inspector.py b/web_inspector.py index 2a1c452..caaa872 100755 --- a/web_inspector.py +++ b/web_inspector.py @@ -189,6 +189,12 @@ } .desc-section { margin-bottom: 20px; } .desc-label { font-family: var(--mono); font-size: 9px; letter-spacing: 0.15em; color: var(--text-dim); text-transform: uppercase; margin-bottom: 6px; padding: 2px 8px; border-left: 2px solid var(--cyan); } +.vlm-envelope { font-family: var(--mono); font-size: 11px; line-height: 1.5; } +.vlm-row { display: grid; grid-template-columns: minmax(120px, max-content) 1fr; gap: 12px; padding: 4px 8px; border-bottom: 1px solid var(--border); } +.vlm-row:last-child { border-bottom: 0; } +.vlm-k { color: var(--text-dim); text-transform: uppercase; letter-spacing: 0.08em; font-size: 10px; padding-top: 2px; } +.vlm-v { color: var(--text-hi); word-break: break-word; } +.vlm-v pre { margin: 0; font-size: 10.5px; } /* ── Sketch panel ────────────────────────────────────────────────────────── */ #sketch-panel pre { @@ -631,6 +637,35 @@ } catch(e) { panel.innerHTML = `
${esc(String(e))}
`; setStatus('ERROR'); } } +// Render the structured envelope returned by VLM multipass mode as an +// HTML definition list. Nested objects/arrays fall back to pretty JSON +// so nothing is lost; the common scalar fields render as one row each. +function renderVlmEnvelope(env) { + const rows = []; + const order = [ + 'summary', 'app', 'screen_type', 'primary_task', + 'focused', 'modal_open', 'controls', 'next_actions', + 'sensitive_regions', 'confidence', 'discrepancies', '_passes', + ]; + const seen = new Set(); + function renderVal(v) { + if (v === null || v === undefined) return 'null'; + if (typeof v === 'string') return esc(v); + if (typeof v === 'number' || typeof v === 'boolean') return String(v); + return `
${esc(JSON.stringify(v, null, 2))}
`; + } + for (const k of order) { + if (!(k in env)) continue; + seen.add(k); + rows.push(`
${esc(k)}${renderVal(env[k])}
`); + } + for (const k of Object.keys(env)) { + if (seen.has(k)) continue; + rows.push(`
${esc(k)}${renderVal(env[k])}
`); + } + return `
${rows.join('')}
`; +} + // Parse "[label]\ntext" blocks from a combined description string. function parseDescSections(desc) { const sections = []; @@ -691,7 +726,24 @@ const sections = parseDescSections(data.description || ''); let html = buildSourcesHdr(caps, sections); for (const [label, text] of sections) { - html += `
${esc(label)}
${esc(text)}
`; + // VLM multipass returns a JSON envelope. Pretty-print it as a + // definition list so the structured fields are readable instead of + // arriving as a wall of braces. Plain-prose VLM and other sections + // stay in
.
+      let body;
+      const trimmed = text.trim();
+      if (label.toLowerCase() === 'vlm'
+          && trimmed.startsWith('{') && trimmed.endsWith('}')) {
+        try {
+          const env = JSON.parse(trimmed);
+          body = renderVlmEnvelope(env);
+        } catch (_) {
+          body = `
${esc(text)}
`; + } + } else { + body = `
${esc(text)}
`; + } + html += `
${esc(label)}
${body}
`; } panel.innerHTML = html || '
No description returned.
'; setStatus('READY'); From 527b2d465428bc0e85db28ea3f6968612de4aa30 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 04:03:07 +0000 Subject: [PATCH 2/7] Add Ollama runner detection and automatic model pull on startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ollama_setup.py with three responsibilities wired into the --mode inspect startup in main.py: 1. Runner detection: asks the user (once) how to invoke the Ollama CLI — native 'ollama', 'docker exec ollama', or any custom prefix. Running Docker containers with 'ollama' in their name are auto-discovered and offered as quick-pick options. The chosen prefix is saved as vlm.ollama_runner in config.json so subsequent launches skip the question. 2. Model-role summary: prints which of the four VLM model slots are configured and what each slot is used for: model — primary (Pass 2 / single-shot) model_fast — Pass 1 scene tag + per-widget crop labels model_actions — Pass 3 next-action candidates (text-only LLM OK) model_verify — optional verify pass (second model family) 3. Auto-pull: runs ' list', compares against configured model IDs, and pulls any missing Ollama models while streaming pull progress to stderr. Cloud-namespaced IDs (anthropic/, openai/, etc.) are skipped. Duplicate model IDs across slots are de-duplicated. Pull failures print a warning but do not abort startup. Adds vlm.ollama_runner to config.json.example with documentation. Adds tests/test_ollama_setup.py covering all deterministic paths (runner detection, model collection, cloud-model skipping, list parsing, deduplication, pull targeting, disabled/empty runner bypass). 
https://claude.ai/code/session_01VhYzhCbZ5qvmBThCH8cxLD --- config.json.example | 3 + main.py | 14 ++ ollama_setup.py | 394 +++++++++++++++++++++++++++++++++++++ tests/test_ollama_setup.py | 237 ++++++++++++++++++++++ 4 files changed, 648 insertions(+) create mode 100644 ollama_setup.py create mode 100644 tests/test_ollama_setup.py diff --git a/config.json.example b/config.json.example index bd6f305..b395462 100644 --- a/config.json.example +++ b/config.json.example @@ -56,6 +56,9 @@ "image_max_dim": 1600, "focused_zoom_pad": 96, + "_ollama_runner": "How to invoke the Ollama CLI. Leave null and set vlm.enabled=true, then run `python main.py --mode inspect` once — OSScreenObserver will ask you interactively and save the choice here. Common values: [] to skip auto-pull; ['ollama'] for a native install; ['docker', 'exec', 'my_container', 'ollama'] for a Docker-based Ollama. On startup, OSScreenObserver checks which configured models are present in the local Ollama library and pulls any that are missing, printing pull progress to stderr.", + "ollama_runner": null, + "_prompt_legacy": "The legacy 'prompt' key remains honoured as a synonym for 'prompt_single' when mode=='single'. When unset, the built-in defaults in description.py are used; tune them per app/agent.", "prompt_single": null, "prompt_scene": null, diff --git a/main.py b/main.py index 541a598..184667b 100644 --- a/main.py +++ b/main.py @@ -236,6 +236,20 @@ def main() -> None: except Exception as e: logger.warning(f"[main] VLM setup skipped: {e}") + # ── Ollama model pull ──────────────────────────────────────────────────── + # Only runs in inspect mode (where stdin is a TTY). Asks once how the + # Ollama CLI should be invoked (native, Docker, custom), then checks + # which configured model slots are locally available and pulls any that + # are missing, streaming pull progress to stderr. 
+ try: + from ollama_setup import ensure_models + ensure_models( + config, args.config, + interactive_ok=(args.mode == "inspect"), + ) + except Exception as e: + logger.warning(f"[main] Ollama model setup skipped: {e}") + # ── Lazy imports (so logging is configured before module-level init runs) try: from observer import ScreenObserver diff --git a/ollama_setup.py b/ollama_setup.py new file mode 100644 index 0000000..3f7b461 --- /dev/null +++ b/ollama_setup.py @@ -0,0 +1,394 @@ +""" +ollama_setup.py — Ollama runner detection, model inventory, and auto-pull. + +Called from main.py in --mode inspect (interactive) when vlm.enabled=true +and the configured base_url points to a local Ollama instance. + +Responsibilities +──────────────── +1. Ask the user (once) how to invoke the ollama CLI: + a) directly — ``ollama`` + b) inside a running Docker container — + ``docker exec ollama`` + c) any custom prefix the user types + + The chosen prefix is saved as ``vlm.ollama_runner`` in config.json so + subsequent launches skip the question. + +2. Discover which model slots are configured in the vlm section: + vlm.model — primary (Pass 2 / single-shot) + vlm.model_fast — Pass 1 + per-widget crop labels + vlm.model_actions — Pass 3 (text-only; can be a non-vision LLM) + vlm.model_verify — optional verify pass (different family recommended) + +3. Run `` list`` to get locally-available models. + +4. Pull any configured model that is not already present, printing a one-line + progress indicator per model. Non-Ollama model identifiers (those + containing a ``/`` namespace prefix such as ``anthropic/claude-3-5-sonnet`` + or ``openai/gpt-4o``) are skipped — they are not pullable via the Ollama + CLI and are assumed to be available through the configured base_url API. + +5. Return success/failure quietly — a pull failure prints a warning but does + not abort startup; the model will fail at inference time with a clear error. 
+""" + +from __future__ import annotations + +import re +import shlex +import subprocess +import sys +import tempfile +import json +import os +from typing import List, Optional, Set, Tuple + +_DOCKER_CONTAINER_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_.-]*$") + + +# ───────────────────────────────────────────────────────────────────────────── +# Runner detection +# ───────────────────────────────────────────────────────────────────────────── + +def _test_runner(runner_prefix: List[str]) -> bool: + """Return True if ``<runner_prefix> list`` exits with code 0.""" + cmd = runner_prefix + ["list"] + try: + r = subprocess.run( + cmd, capture_output=True, timeout=10, + ) + return r.returncode == 0 + except (FileNotFoundError, OSError, subprocess.TimeoutExpired): + return False + + +def _detect_docker_containers() -> List[str]: + """Return names of running Docker containers that have 'ollama' in their + name, so we can offer them as quick-pick options.""" + try: + r = subprocess.run( + ["docker", "ps", "--format", "{{.Names}}"], + capture_output=True, text=True, timeout=8, + ) + if r.returncode != 0: + return [] + return [n.strip() for n in r.stdout.splitlines() + if n.strip() and "ollama" in n.lower()] + except Exception: + return [] + + +def _ask_runner() -> Optional[List[str]]: + """Interactively ask the user how to run the Ollama CLI. + + Returns the runner prefix tokens ([] for Skip); None on EOF or Ctrl-C.
+ """ + print( + "\n[ollama_setup] How should OSScreenObserver invoke the Ollama CLI?", + file=sys.stderr, + ) + + options: List[Tuple[str, List[str]]] = [] + + # Option 1: native ollama on PATH + if _test_runner(["ollama"]): + options.append(("ollama (detected on PATH)", ["ollama"])) + + # Option 2: Docker containers named with 'ollama' + containers = _detect_docker_containers() + for c in containers: + prefix = ["docker", "exec", c, "ollama"] + label = f"docker exec {c} ollama (container running)" + options.append((label, prefix)) + + options.append(("Custom — type your own prefix", None)) + options.append(("Skip — do not pull models automatically", [])) + + for i, (label, _) in enumerate(options): + print(f" {i + 1}. {label}", file=sys.stderr) + + while True: + try: + raw = input(" Select [1]: ").strip() + except (EOFError, KeyboardInterrupt): + return None + if raw == "": + raw = "1" + if raw.isdigit(): + idx = int(raw) - 1 + if 0 <= idx < len(options): + _, prefix = options[idx] + if prefix is None: + # Custom entry + try: + custom = input( + " Enter the full prefix (e.g. " + "'docker exec my_ollama ollama'): " + ).strip() + except (EOFError, KeyboardInterrupt): + return None + if not custom: + print(" (empty — skipping)", file=sys.stderr) + return [] + tokens = shlex.split(custom) + if not _test_runner(tokens): + print( + f" WARNING: '{' '.join(tokens)} list' did not " + f"succeed — check the prefix and try again.", + file=sys.stderr, + ) + try: + ok = input(" Use it anyway? [y/N] ").strip().lower() + except (EOFError, KeyboardInterrupt): + return None + if ok != "y": + continue + return tokens + if prefix == []: # skip + return [] + return prefix + print(f" Please enter a number 1–{len(options)}.", file=sys.stderr) + + +def ensure_runner(config: dict, config_path: str, *, + interactive_ok: bool) -> List[str]: + """Return the Ollama runner prefix (list of tokens). + + If ``vlm.ollama_runner`` is already in config, return that. 
+ Otherwise, when interactive_ok, ask and save. Otherwise return []. + """ + vlm = config.get("vlm") or {} + saved = vlm.get("ollama_runner") + if saved is not None: # could be [] (skip) or a non-empty list + if isinstance(saved, list): + return saved + if isinstance(saved, str) and saved: + return shlex.split(saved) + return [] + + if not interactive_ok or not sys.stdin.isatty(): + return [] + + runner = _ask_runner() + if runner is None: + return [] + + # Persist the choice. + vlm["ollama_runner"] = runner + _atomic_save(config_path, lambda cfg: cfg.setdefault("vlm", {}).update( + {"ollama_runner": runner} + )) + if runner: + print( + f"[ollama_setup] Runner saved: {' '.join(runner)!r}", + file=sys.stderr, + ) + else: + print("[ollama_setup] Auto-pull skipped.", file=sys.stderr) + return runner + + +# ───────────────────────────────────────────────────────────────────────────── +# Model inventory and pull +# ───────────────────────────────────────────────────────────────────────────── + +def _collect_model_names(vlm_cfg: dict) -> List[Tuple[str, str]]: + """Return [(model_id, description), ...] for all non-empty model slots.""" + slots = [ + ("model", "primary (Pass 2 / single-shot)"), + ("model_fast", "fast (Pass 1 + crop labels)"), + ("model_actions", "actions (Pass 3, text-only OK)"), + ("model_verify", "verify (optional, second family)"), + ] + result = [] + for key, description in slots: + val = vlm_cfg.get(key) + if val and isinstance(val, str): + result.append((val, description)) + return result + + +def _is_ollama_model(model_id: str) -> bool: + """Return False for namespaced cloud-model IDs (anthropic/, openai/, etc.) + that are not pullable via the Ollama CLI.""" + # Ollama models look like "qwen2.5vl:7b", "llama3.2-vision:11b", etc. + # Cloud models look like "anthropic/claude-3-5-sonnet", "openai/gpt-4o".
+ return "/" not in model_id + + +def _list_local_models(runner: List[str]) -> Set[str]: + """Return the set of model IDs currently in the local Ollama library.""" + try: + r = subprocess.run( + runner + ["list"], + capture_output=True, text=True, timeout=15, + ) + if r.returncode != 0: + return set() + lines = r.stdout.strip().splitlines() + names: Set[str] = set() + for line in lines[1:]: # skip header row + parts = line.split() + if parts: + names.add(parts[0]) # first column is NAME (with tag) + return names + except Exception: + return set() + + +def _pull_model(runner: List[str], model_id: str) -> bool: + """Pull *model_id* using *runner*, streaming output to stderr. + + Returns True on success, False on error. + """ + cmd = runner + ["pull", model_id] + print( + f"[ollama_setup] Pulling {model_id!r} ({' '.join(cmd)})", + file=sys.stderr, + ) + try: + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + assert proc.stdout is not None + for line in proc.stdout: + line = line.rstrip() + if line: + print(f" {line}", file=sys.stderr) + proc.wait() + if proc.returncode == 0: + print(f"[ollama_setup] ✓ {model_id!r} ready.", file=sys.stderr) + return True + print( + f"[ollama_setup] ✗ Pull failed for {model_id!r} " + f"(exit {proc.returncode}) — inference will fail at runtime.", + file=sys.stderr, + ) + return False + except Exception as exc: + print( + f"[ollama_setup] ✗ Could not run pull for {model_id!r}: {exc}", + file=sys.stderr, + ) + return False + + +def ensure_models(config: dict, config_path: str, *, + interactive_ok: bool) -> None: + """Check all configured VLM model slots and pull any that are missing. + + Flow: + 1. Determine the runner prefix (ask if needed). + 2. Collect unique model IDs across all four slots. + 3. Skip cloud-namespaced IDs. + 4. Compare against locally-installed models. + 5. Pull each missing model, printing progress. 
+ """ + vlm = config.get("vlm") or {} + if not vlm.get("enabled"): + return + + runner = ensure_runner(config, config_path, interactive_ok=interactive_ok) + if not runner: + return # user opted out or non-interactive + + pairs = _collect_model_names(vlm) + if not pairs: + return + + # Report the full model-role mapping. + print("\n[ollama_setup] Configured VLM model slots:", file=sys.stderr) + seen_ids: dict[str, str] = {} # model_id → first description seen + for model_id, description in pairs: + role = f" {description}" + if model_id in seen_ids: + role += f" (same as {seen_ids[model_id]})" + else: + seen_ids[model_id] = description + print(f" {role}: {model_id}", file=sys.stderr) + + # Filter to Ollama-pullable models only; de-duplicate. + to_check: List[str] = [] + skipped: List[str] = [] + seen: Set[str] = set() + for model_id, _ in pairs: + if model_id in seen: + continue + seen.add(model_id) + if _is_ollama_model(model_id): + to_check.append(model_id) + else: + skipped.append(model_id) + + if skipped: + print( + "[ollama_setup] Skipping cloud-model IDs (not Ollama-pullable): " + + ", ".join(repr(m) for m in skipped), + file=sys.stderr, + ) + + if not to_check: + return + + print( + f"\n[ollama_setup] Checking {len(to_check)} Ollama model(s)…", + file=sys.stderr, + ) + local = _list_local_models(runner) + + already_ok: List[str] = [] + to_pull: List[str] = [] + for model_id in to_check: + # Ollama list output uses full tag (e.g. "qwen2.5vl:7b"); also + # handle the implicit :latest suffix. 
+ tag = model_id if ":" in model_id else model_id + ":latest" + if tag in local or model_id in local: + already_ok.append(model_id) + else: + to_pull.append(model_id) + + if already_ok: + print( + "[ollama_setup] Already available: " + + ", ".join(repr(m) for m in already_ok), + file=sys.stderr, + ) + + if not to_pull: + print("[ollama_setup] All models present — nothing to pull.", + file=sys.stderr) + return + + print( + f"[ollama_setup] Need to pull {len(to_pull)} model(s): " + + ", ".join(repr(m) for m in to_pull), + file=sys.stderr, + ) + for model_id in to_pull: + _pull_model(runner, model_id) + + +# ───────────────────────────────────────────────────────────────────────────── +# Config persistence helper +# ───────────────────────────────────────────────────────────────────────────── + +def _atomic_save(config_path: str, mutate) -> None: + """Load config_path, call mutate(cfg), write atomically via rename.""" + try: + with open(config_path, encoding="utf-8") as f: + cfg = json.load(f) + mutate(cfg) + dir_name = os.path.dirname(os.path.abspath(config_path)) or "." 
+ fd, tmp = tempfile.mkstemp( + prefix=".config.", suffix=".json.tmp", dir=dir_name, + ) + with os.fdopen(fd, "w", encoding="utf-8") as f: + json.dump(cfg, f, indent=2, ensure_ascii=False) + f.write("\n") + os.replace(tmp, config_path) + except Exception as exc: + print(f"[ollama_setup] Could not save config: {exc}", file=sys.stderr) diff --git a/tests/test_ollama_setup.py b/tests/test_ollama_setup.py new file mode 100644 index 0000000..4b99e82 --- /dev/null +++ b/tests/test_ollama_setup.py @@ -0,0 +1,237 @@ +"""Tests for ollama_setup — runner detection, model inventory, and pull.""" +from __future__ import annotations + +import json +from unittest.mock import MagicMock, patch + +from ollama_setup import ( + _collect_model_names, + _is_ollama_model, + _list_local_models, + ensure_models, + ensure_runner, +) + + +# ─── _is_ollama_model ──────────────────────────────────────────────────────── + +def test_is_ollama_model_plain(): + assert _is_ollama_model("qwen2.5vl:7b") is True + assert _is_ollama_model("llama3.2-vision:11b") is True + assert _is_ollama_model("minicpm-v:8b") is True + assert _is_ollama_model("moondream:latest") is True + + +def test_is_ollama_model_cloud_namespace(): + assert _is_ollama_model("anthropic/claude-3-5-sonnet") is False + assert _is_ollama_model("openai/gpt-4o") is False + assert _is_ollama_model("meta/llama3") is False + + +# ─── _collect_model_names ──────────────────────────────────────────────────── + +def test_collect_model_names_all_slots(): + vlm = { + "model": "qwen2.5vl:7b", + "model_fast": "qwen2.5vl:3b", + "model_actions": "qwen2.5:14b", + "model_verify": "llama3.2-vision:11b", + } + names = [m for m, _ in _collect_model_names(vlm)] + assert names == ["qwen2.5vl:7b", "qwen2.5vl:3b", "qwen2.5:14b", + "llama3.2-vision:11b"] + + +def test_collect_model_names_empty_slots_skipped(): + vlm = {"model": "qwen2.5vl:7b", "model_fast": None, "model_verify": ""} + names = [m for m, _ in _collect_model_names(vlm)] + assert names == 
["qwen2.5vl:7b"] + + +def test_collect_model_names_no_slots(): + assert _collect_model_names({}) == [] + + +# ─── _list_local_models ────────────────────────────────────────────────────── + +_OLLAMA_LIST_OUTPUT = """\ +NAME ID SIZE MODIFIED +qwen2.5vl:7b abc123 5.0 GB 2 days ago +qwen2.5vl:3b def456 2.1 GB 2 days ago +llama3.2-vision:11b ghi789 8.0 GB 1 week ago +""" + + +def _mk_run(returncode=0, stdout=""): + mock = MagicMock() + mock.returncode = returncode + mock.stdout = stdout + return mock + + +def test_list_local_models_parses_ollama_output(): + with patch("subprocess.run", return_value=_mk_run(0, _OLLAMA_LIST_OUTPUT)): + result = _list_local_models(["ollama"]) + assert "qwen2.5vl:7b" in result + assert "qwen2.5vl:3b" in result + assert "llama3.2-vision:11b" in result + + +def test_list_local_models_returns_empty_on_error(): + with patch("subprocess.run", return_value=_mk_run(1, "")): + result = _list_local_models(["ollama"]) + assert result == set() + + +def test_list_local_models_passes_correct_command(): + captured = {} + def fake_run(cmd, **kw): + captured["cmd"] = cmd + return _mk_run(0, _OLLAMA_LIST_OUTPUT) + with patch("subprocess.run", side_effect=fake_run): + _list_local_models(["docker", "exec", "mybox", "ollama"]) + assert captured["cmd"] == ["docker", "exec", "mybox", "ollama", "list"] + + +# ─── ensure_runner ─────────────────────────────────────────────────────────── + +def _cfg_with_runner(tmp_path, runner_value): + cfg = {"vlm": {"enabled": True, "ollama_runner": runner_value}} + p = tmp_path / "config.json" + p.write_text(json.dumps(cfg), encoding="utf-8") + return cfg, str(p) + + +def test_ensure_runner_returns_saved_list(tmp_path): + cfg, path = _cfg_with_runner(tmp_path, ["ollama"]) + result = ensure_runner(cfg, path, interactive_ok=False) + assert result == ["ollama"] + + +def test_ensure_runner_returns_empty_list_when_skip_saved(tmp_path): + cfg, path = _cfg_with_runner(tmp_path, []) + result = ensure_runner(cfg, path, 
interactive_ok=False) + assert result == [] + + +def test_ensure_runner_non_interactive_returns_empty_when_unset(tmp_path): + cfg = {"vlm": {"enabled": True}} + p = tmp_path / "config.json" + p.write_text(json.dumps(cfg), encoding="utf-8") + result = ensure_runner(cfg, str(p), interactive_ok=False) + assert result == [] + + +def test_ensure_runner_parses_string_runner(tmp_path): + cfg, path = _cfg_with_runner(tmp_path, "docker exec mybox ollama") + result = ensure_runner(cfg, path, interactive_ok=False) + assert result == ["docker", "exec", "mybox", "ollama"] + + +# ─── ensure_models ─────────────────────────────────────────────────────────── + +def _cfg_full(tmp_path, **vlm_extra): + vlm = { + "enabled": True, + "model": "qwen2.5vl:7b", + "model_fast": "qwen2.5vl:3b", + "model_actions": None, + "model_verify": None, + "ollama_runner": ["ollama"], + **vlm_extra, + } + cfg = {"vlm": vlm} + p = tmp_path / "config.json" + p.write_text(json.dumps(cfg), encoding="utf-8") + return cfg, str(p) + + +def test_ensure_models_skips_when_disabled(tmp_path): + cfg = {"vlm": {"enabled": False}} + p = tmp_path / "config.json" + p.write_text(json.dumps(cfg), encoding="utf-8") + # Should not call subprocess at all. + with patch("subprocess.run") as mock_run: + ensure_models(cfg, str(p), interactive_ok=False) + mock_run.assert_not_called() + + +def test_ensure_models_skips_when_runner_empty(tmp_path): + cfg, path = _cfg_full(tmp_path, ollama_runner=[]) + with patch("subprocess.run") as mock_run: + ensure_models(cfg, path, interactive_ok=False) + mock_run.assert_not_called() + + +def test_ensure_models_pulls_missing(tmp_path): + cfg, path = _cfg_full(tmp_path) + # Local Ollama has qwen2.5vl:7b but not qwen2.5vl:3b. 
+ local_output = "NAME\nqwen2.5vl:7b abc 5GB 1d\n" + + pull_calls = [] + + def fake_popen(cmd, **kw): + pull_calls.append(cmd) + proc = MagicMock() + proc.stdout = iter(["pulling manifest\n", "success\n"]) + proc.returncode = 0 + proc.wait.return_value = 0 + return proc + + with patch("subprocess.run", return_value=_mk_run(0, local_output)), \ + patch("subprocess.Popen", side_effect=fake_popen): + ensure_models(cfg, path, interactive_ok=False) + + assert any("qwen2.5vl:3b" in str(c) for c in pull_calls), \ + f"Expected pull for qwen2.5vl:3b; got {pull_calls}" + # Should NOT pull qwen2.5vl:7b (already present). + for c in pull_calls: + assert "qwen2.5vl:7b" not in str(c) or "3b" in str(c), \ + f"Unexpected pull for qwen2.5vl:7b: {c}" + + +def test_ensure_models_nothing_to_pull(tmp_path): + cfg, path = _cfg_full(tmp_path) + local_output = ( + "NAME\n" + "qwen2.5vl:7b abc 5GB 1d\n" + "qwen2.5vl:3b def 2GB 1d\n" + ) + with patch("subprocess.run", return_value=_mk_run(0, local_output)), \ + patch("subprocess.Popen") as mock_popen: + ensure_models(cfg, path, interactive_ok=False) + mock_popen.assert_not_called() + + +def test_ensure_models_skips_cloud_models(tmp_path): + cfg, path = _cfg_full(tmp_path, + model="anthropic/claude-3-5-sonnet", + model_fast="openai/gpt-4o") + with patch("subprocess.run", return_value=_mk_run(0, "NAME\n")), \ + patch("subprocess.Popen") as mock_popen: + ensure_models(cfg, path, interactive_ok=False) + # Cloud-namespaced models are not Ollama-pullable — no pull attempted. + mock_popen.assert_not_called() + + +def test_ensure_models_deduplicates(tmp_path): + # model and model_fast are the same — pull only once. 
+ cfg, path = _cfg_full(tmp_path, + model="qwen2.5vl:7b", + model_fast="qwen2.5vl:7b") + pull_calls = [] + + def fake_popen(cmd, **kw): + pull_calls.append(cmd) + proc = MagicMock() + proc.stdout = iter([]) + proc.returncode = 0 + proc.wait.return_value = 0 + return proc + + with patch("subprocess.run", return_value=_mk_run(0, "NAME\n")), \ + patch("subprocess.Popen", side_effect=fake_popen): + ensure_models(cfg, path, interactive_ok=False) + + # Only one pull call, not two. + assert len(pull_calls) == 1 From b65a77dc5e275f8df486149e597b97576112408f Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 10:43:36 +0000 Subject: [PATCH 3/7] Make --mode inspect the default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flips the argparse default from 'both' to 'inspect' so first-run users get the interactive VLM model picker and Ollama auto-pull. In 'mcp' and 'both' modes stdin is owned by the MCP framing channel, so the picker silently disables VLM if no model is configured — the new default ensures the setup paths added in the prior commits actually fire. vlm.enabled in config.json.example is already true, so out-of-the-box a fresh checkout runs: load config → ask for the Ollama runner → pull configured models → start the web inspector on http://127.0.0.1:5001. Updates the docstring, argparse help, and README examples to match. The Claude Desktop MCP integration block in README still shows --mode both since that's the only mode that makes sense in that context. https://claude.ai/code/session_01VhYzhCbZ5qvmBThCH8cxLD --- README.md | 5 +++-- main.py | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 06eb056..5f7e0e4 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,10 @@ OSScreenObserver exposes a full REST API on port `5001` (configurable). 
Most `/a ### Startup modes ```bash -python main.py --mode inspect # HTTP server only (web UI + REST API) +python main.py # Default: HTTP server only (web UI + REST API, interactive VLM setup) python main.py --mode both # REST API + MCP stdio simultaneously -python main.py --mock --mode inspect # Mock mode with synthetic data (no OS access) +python main.py --mode mcp # MCP stdio only +python main.py --mock # Mock mode with synthetic data (no OS access) python main.py --mock --scenario scenarios_examples/login.yaml # Scenario-driven mock ``` diff --git a/main.py b/main.py index 184667b..652459d 100644 --- a/main.py +++ b/main.py @@ -3,9 +3,13 @@ Usage ───── - # Both MCP server (stdio) + web inspector (port 5001) simultaneously + # Web inspector only (port 5001) — the default; stdin is free so VLM + # model setup and Ollama auto-pull can prompt interactively on first run. python main.py + # Both MCP server (stdio) + web inspector simultaneously + python main.py --mode both + # Web inspector only (useful for manual exploration) python main.py --mode inspect @@ -171,15 +175,16 @@ def build_parser() -> argparse.ArgumentParser: formatter_class = argparse.RawDescriptionHelpFormatter, epilog = """ examples: - python main.py # both MCP + web UI - python main.py --mode inspect # web UI only - python main.py --mode mcp # MCP stdio only + python main.py # web UI only (default) + python main.py --mode both # MCP stdio + web UI + python main.py --mode mcp # MCP stdio only python main.py --mock # mock data (no OS access needed) - python main.py --mock --mode inspect --port 8080 + python main.py --mock --port 8080 """, ) - p.add_argument("--mode", choices=["mcp", "inspect", "both"], default="both", - help="Run mode (default: both)") + p.add_argument("--mode", choices=["mcp", "inspect", "both"], default="inspect", + help="Run mode (default: inspect — interactive VLM setup " + "runs only in this mode)") p.add_argument("--config", default="config.json", help="Path to JSON config file 
(default: config.json)") p.add_argument("--mock", action="store_true", From 0ec290fdc9646c405386cc9d84828b0860de86ec Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 10:47:29 +0000 Subject: [PATCH 4/7] Add platform launcher scripts: start.sh, start-mac.sh, start.bat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each launcher detects missing dependencies, prompts before installing, sets up a .venv, installs requirements.txt, and starts the server in the default mode (inspect — interactive VLM setup runs only in this mode). Dependencies checked per platform: Linux: python3 (+ python3-venv/pip on apt), tesseract-ocr, wmctrl, ollama (via official install.sh). Uses apt/dnf/pacman/zypper as appropriate. macOS: Homebrew (offers to install if missing), python3, tesseract, ollama. Prints the Accessibility/Screen Recording permission note that AX adapter + screenshot capture both require. Windows: Python 3.12, Tesseract (UB-Mannheim build), Ollama via winget on Windows 10 1809+ / Windows 11. Falls back to printing the download URLs on older systems. Each prompt defaults to Yes (Enter to accept) so the happy path needs no typing. Skipping any prompt continues without that capability — for example skipping Ollama is fine if the user plans to point vlm.base_url at a remote endpoint or run with vlm.enabled=false. Adds a "Quick start — automated launchers" section to README pointing at the three scripts, ahead of the existing manual-install instructions which remain for users who prefer to manage their own environment. 
https://claude.ai/code/session_01VhYzhCbZ5qvmBThCH8cxLD --- README.md | 25 +++++++++ start-mac.sh | 131 +++++++++++++++++++++++++++++++++++++++++++++ start.bat | 131 +++++++++++++++++++++++++++++++++++++++++++++ start.sh | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 434 insertions(+) create mode 100755 start-mac.sh create mode 100644 start.bat create mode 100755 start.sh diff --git a/README.md b/README.md index 5f7e0e4..5493052 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,31 @@ The REST API endpoints map directly to the `SCREEN_TOOLS` OpenAI/OpenWebUI funct ## Installation +### Quick start — automated launchers + +The fastest path is the platform launcher, which detects missing +dependencies (Python, Tesseract, Ollama, wmctrl on Linux), asks before +installing each one, sets up a `.venv/`, installs `requirements.txt`, and +starts the server: + +```bash +# Linux +./start.sh + +# macOS +./start-mac.sh + +# Windows (Command Prompt or PowerShell) +start.bat +``` + +The scripts use `winget` on Windows, Homebrew on macOS, and the native +package manager on Linux (apt / dnf / pacman / zypper). Skip any prompt +to install manually later; the launcher will still bring up whatever is +already working. + +For a manual install, follow the steps below. + ### 1. Python environment ```bash diff --git a/start-mac.sh b/start-mac.sh new file mode 100755 index 0000000..ac683b3 --- /dev/null +++ b/start-mac.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +# start-mac.sh — macOS launcher for OSScreenObserver. +# +# Detects missing system + Python dependencies, prompts before installing, +# then starts the server in the default mode (inspect — interactive VLM +# setup runs only in this mode). 
+ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# ─── helpers ───────────────────────────────────────────────────────────────── + +confirm() { + local prompt="$1" + local reply + read -r -p "$prompt [Y/n] " reply + case "$reply" in + ""|y|Y|yes|YES) return 0 ;; + *) return 1 ;; + esac +} + +have() { command -v "$1" >/dev/null 2>&1; } + +echo "═══════════════════════════════════════════════════════════════" +echo " OSScreenObserver — macOS launcher" +echo "═══════════════════════════════════════════════════════════════" + +# ─── Homebrew (prerequisite for everything else) ───────────────────────────── + +if ! have brew; then + echo " ✗ Homebrew not found — required to install tesseract / ollama / python." + echo " See https://brew.sh" + if confirm "Install Homebrew now?"; then + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + # Add brew to PATH for this shell (Apple Silicon vs Intel). + if [[ -x /opt/homebrew/bin/brew ]]; then + eval "$(/opt/homebrew/bin/brew shellenv)" + elif [[ -x /usr/local/bin/brew ]]; then + eval "$(/usr/local/bin/brew shellenv)" + fi + else + echo " Continuing without brew — you must install dependencies manually." + fi +else + echo " ✓ brew → $(brew --version | head -n1)" +fi + +# ─── Python 3 ──────────────────────────────────────────────────────────────── + +if ! have python3; then + echo " ✗ python3 not found." + if have brew && confirm "Install Python 3 via Homebrew?"; then + brew install python + else + echo " Aborting — Python 3 is required." + exit 1 + fi +fi +echo " ✓ python3 → $(python3 --version)" + +# ─── Tesseract (OCR) ───────────────────────────────────────────────────────── + +if ! have tesseract; then + echo " ✗ tesseract not found — OCR will be unavailable." 
+ if have brew && confirm "Install tesseract via Homebrew?"; then + brew install tesseract + fi +else + echo " ✓ tesseract → $(tesseract --version 2>&1 | head -n1)" +fi + +# ─── Ollama (optional — for local VLM) ─────────────────────────────────────── + +if ! have ollama; then + echo " ⓘ ollama not found — required only if you want a local VLM." + if have brew && confirm "Install Ollama via Homebrew?"; then + brew install ollama + echo " (start the Ollama service in a separate shell: 'ollama serve')" + else + echo " (skipping; either set vlm.enabled=false in config.json or" + echo " point vlm.base_url at a remote endpoint)" + fi +else + echo " ✓ ollama → $(ollama --version 2>/dev/null | head -n1)" +fi + +# ─── Python virtualenv + pip install ───────────────────────────────────────── + +VENV_DIR="${SCRIPT_DIR}/.venv" +if [[ ! -d "$VENV_DIR" ]]; then + if confirm "Create a project virtualenv at .venv/?"; then + python3 -m venv "$VENV_DIR" + else + echo " (using system Python; you may want a venv to avoid clobbering system packages)" + fi +fi + +if [[ -d "$VENV_DIR" ]]; then + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + echo " ✓ activated virtualenv .venv/" +fi + +if confirm "Install/upgrade Python dependencies from requirements.txt?"; then + python3 -m pip install --upgrade pip + python3 -m pip install -r requirements.txt + if confirm "Also install pyobjc for full macOS accessibility-tree support?"; then + python3 -m pip install pyobjc + fi +fi + +# ─── macOS accessibility-permissions note ──────────────────────────────────── + +cat <<'EOF' + + ⓘ macOS requires Accessibility + Screen Recording permissions for the + AX adapter and screenshot capture. The first run will trigger a + permission prompt; grant Terminal/iTerm/your shell host in + System Settings → Privacy & Security → Accessibility and Screen Recording. 
+ +EOF + +# ─── Launch ────────────────────────────────────────────────────────────────── + +echo " Starting OSScreenObserver (default mode: inspect)…" +echo " Web UI → http://127.0.0.1:5001" +echo "" +exec python3 main.py "$@" diff --git a/start.bat b/start.bat new file mode 100644 index 0000000..fbc2d33 --- /dev/null +++ b/start.bat @@ -0,0 +1,131 @@ +@echo off +REM ═══════════════════════════════════════════════════════════════════════════ +REM start.bat — Windows launcher for OSScreenObserver. +REM +REM Detects missing dependencies, prompts before installing, then starts the +REM server in the default mode (inspect — interactive VLM setup runs only +REM in this mode). +REM +REM Uses winget when available (Windows 10 1809+ / Windows 11) to install +REM Python, Tesseract, and Ollama. Falls back to printing the download URL +REM on older systems. +REM ═══════════════════════════════════════════════════════════════════════════ + +setlocal EnableDelayedExpansion +cd /d "%~dp0" + +echo =============================================================== +echo OSScreenObserver - Windows launcher +echo =============================================================== + +REM ─── Detect winget ────────────────────────────────────────────────────────── + +set "HAS_WINGET=0" +where winget >nul 2>&1 && set "HAS_WINGET=1" + +REM ─── Python ───────────────────────────────────────────────────────────────── + +set "PY_CMD=" +where python >nul 2>&1 && set "PY_CMD=python" +if "%PY_CMD%"=="" where py >nul 2>&1 && set "PY_CMD=py -3" + +if "%PY_CMD%"=="" ( + echo [x] Python 3 was not found on PATH. + if "%HAS_WINGET%"=="1" ( + call :CONFIRM "Install Python 3.12 via winget?" + if !ANSWER!==Y ( + winget install -e --id Python.Python.3.12 --accept-package-agreements --accept-source-agreements + echo Please close and re-open this terminal so PATH picks up Python, then re-run start.bat. + pause + exit /b 0 + ) + ) else ( + echo winget is unavailable. 
Install Python from https://www.python.org/downloads/ + ) + echo Aborting - Python 3 is required. + pause + exit /b 1 +) +for /f "tokens=*" %%V in ('%PY_CMD% --version 2^>^&1') do set "PY_VER=%%V" +echo [+] Python -^> %PY_VER% + +REM ─── Tesseract (OCR) ──────────────────────────────────────────────────────── + +set "HAS_TESS=0" +where tesseract >nul 2>&1 && set "HAS_TESS=1" +if not exist "%ProgramFiles%\Tesseract-OCR\tesseract.exe" ( + if "%HAS_TESS%"=="0" ( + echo [x] tesseract not found - OCR will be unavailable. + if "%HAS_WINGET%"=="1" ( + call :CONFIRM "Install Tesseract via winget?" + if !ANSWER!==Y ( + winget install -e --id UB-Mannheim.TesseractOCR --accept-package-agreements --accept-source-agreements + echo After install, set ocr.tesseract_cmd in config.json to the full path: + echo "C:/Program Files/Tesseract-OCR/tesseract.exe" + ) + ) else ( + echo Install from https://github.com/UB-Mannheim/tesseract/wiki + echo then set ocr.tesseract_cmd in config.json to the installed path. + ) + ) +) else ( + echo [+] tesseract present at "%ProgramFiles%\Tesseract-OCR\tesseract.exe" +) + +REM ─── Ollama (optional - for local VLM) ────────────────────────────────────── + +where ollama >nul 2>&1 +if errorlevel 1 ( + echo [i] ollama not found - required only if you want a local VLM. + if "%HAS_WINGET%"=="1" ( + call :CONFIRM "Install Ollama via winget?" + if !ANSWER!==Y ( + winget install -e --id Ollama.Ollama --accept-package-agreements --accept-source-agreements + ) + ) else ( + echo Install from https://ollama.com/download/windows + ) +) else ( + for /f "tokens=*" %%V in ('ollama --version 2^>nul') do set "OLLAMA_VER=%%V" + echo [+] ollama -^> !OLLAMA_VER! +) + +REM ─── Python virtualenv + pip install ──────────────────────────────────────── + +set "VENV_DIR=%CD%\.venv" +if not exist "%VENV_DIR%\Scripts\activate.bat" ( + call :CONFIRM "Create a project virtualenv at .venv\?" 
+ if !ANSWER!==Y ( + %PY_CMD% -m venv "%VENV_DIR%" + ) +) + +if exist "%VENV_DIR%\Scripts\activate.bat" ( + call "%VENV_DIR%\Scripts\activate.bat" + echo [+] activated virtualenv .venv\ + set "PY_CMD=python" +) + +call :CONFIRM "Install/upgrade Python dependencies from requirements.txt?" +if !ANSWER!==Y ( + %PY_CMD% -m pip install --upgrade pip + %PY_CMD% -m pip install -r requirements.txt +) + +REM ─── Launch ───────────────────────────────────────────────────────────────── + +echo. +echo Starting OSScreenObserver (default mode: inspect)... +echo Web UI -^> http://127.0.0.1:5001 +echo. +%PY_CMD% main.py %* +exit /b %ERRORLEVEL% + +REM ─── :CONFIRM subroutine — sets ANSWER=Y or ANSWER=N ─────────────────────── +:CONFIRM +set "ANSWER=N" +set /p "REPLY=%~1 [Y/n] " +if /i "%REPLY%"=="" set "ANSWER=Y" +if /i "%REPLY%"=="y" set "ANSWER=Y" +if /i "%REPLY%"=="yes" set "ANSWER=Y" +exit /b 0 diff --git a/start.sh b/start.sh new file mode 100755 index 0000000..c1f023e --- /dev/null +++ b/start.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +# start.sh — Linux launcher for OSScreenObserver. +# +# Detects missing system + Python dependencies, prompts before installing, +# then starts the server in the default mode (inspect — interactive VLM +# setup runs only in this mode). + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# ─── helpers ───────────────────────────────────────────────────────────────── + +confirm() { + # confirm "prompt text" — returns 0 on y/Y/, 1 otherwise. 
+ local prompt="$1" + local reply + read -r -p "$prompt [Y/n] " reply + case "$reply" in + ""|y|Y|yes|YES) return 0 ;; + *) return 1 ;; + esac +} + +have() { command -v "$1" >/dev/null 2>&1; } + +detect_pkg_manager() { + if have apt-get; then echo "apt" + elif have dnf; then echo "dnf" + elif have pacman; then echo "pacman" + elif have zypper; then echo "zypper" + else echo "" + fi +} + +install_pkg() { + # install_pkg APT_NAME DNF_NAME PACMAN_NAME ZYPPER_NAME — package name to use under each package manager + local pm; pm="$(detect_pkg_manager)" + case "$pm" in + apt) sudo apt-get update && sudo apt-get install -y "$1" ;; + dnf) sudo dnf install -y "$2" ;; + pacman) sudo pacman -S --noconfirm "$3" ;; + zypper) sudo zypper install -y "$4" ;; + *) + echo " ✗ No supported package manager found." >&2 + echo " Install manually: apt='$1', dnf='$2', pacman='$3', zypper='$4'" >&2 + return 1 + ;; + esac +} + +echo "═══════════════════════════════════════════════════════════════" +echo " OSScreenObserver — Linux launcher" +echo "═══════════════════════════════════════════════════════════════" + +# ─── Python ────────────────────────────────────────────────────────────────── + +if ! have python3; then + echo " ✗ Python 3 is required but was not found on PATH." + if confirm "Install Python 3 now?"; then + install_pkg python3 python3 python python3 + else + echo " Aborting — Python 3 is required." + exit 1 + fi +fi +echo " ✓ python3 → $(python3 --version)" + +# Debian/Ubuntu split venv + pip out of the python3 metapackage; install +# them explicitly when missing. On dnf/pacman/zypper these ship with python3. +if [[ "$(detect_pkg_manager)" == "apt" ]]; then + if ! python3 -c "import venv" >/dev/null 2>&1; then + confirm "Install python3-venv (required to create the .venv)?" && \ + sudo apt-get install -y python3-venv + fi + if ! python3 -m pip --version >/dev/null 2>&1; then + confirm "Install python3-pip?" && sudo apt-get install -y python3-pip + fi +fi + +# ─── Tesseract (OCR) ───────────────────────────────────────────────────────── + +if ! 
have tesseract; then + echo " ✗ tesseract not found — OCR will be unavailable." + if confirm "Install tesseract-ocr (system package)?"; then + install_pkg tesseract-ocr tesseract tesseract tesseract-ocr || \ + echo " (continuing without OCR; description.from_ocr will report missing binary)" + fi +else + echo " ✓ tesseract → $(tesseract --version 2>&1 | head -n1)" +fi + +# ─── wmctrl (window enumeration) ───────────────────────────────────────────── + +if ! have wmctrl; then + echo " ✗ wmctrl not found — window enumeration will fall back to python-xlib." + if confirm "Install wmctrl?"; then + install_pkg wmctrl wmctrl wmctrl wmctrl || true + fi +else + echo " ✓ wmctrl present" +fi + +# ─── Ollama (optional — for local VLM) ─────────────────────────────────────── + +if ! have ollama; then + echo " ⓘ ollama not found — required only if you want a local VLM." + if confirm "Install Ollama via the official install script?"; then + curl -fsSL https://ollama.com/install.sh | sh + else + echo " (skipping; either set vlm.enabled=false in config.json or" + echo " point vlm.base_url at a remote endpoint)" + fi +else + echo " ✓ ollama → $(ollama --version 2>/dev/null | head -n1)" +fi + +# ─── Python virtualenv + pip install ───────────────────────────────────────── + +VENV_DIR="${SCRIPT_DIR}/.venv" +if [[ ! 
-d "$VENV_DIR" ]]; then + if confirm "Create a project virtualenv at .venv/?"; then + python3 -m venv "$VENV_DIR" + else + echo " (using system Python; you may want a venv to avoid clobbering system packages)" + fi +fi + +if [[ -d "$VENV_DIR" ]]; then + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + echo " ✓ activated virtualenv .venv/" +fi + +if confirm "Install/upgrade Python dependencies from requirements.txt?"; then + python3 -m pip install --upgrade pip + python3 -m pip install -r requirements.txt +fi + +# ─── Launch ────────────────────────────────────────────────────────────────── + +echo "" +echo " Starting OSScreenObserver (default mode: inspect)…" +echo " Web UI → http://127.0.0.1:5001" +echo "" +exec python3 main.py "$@" From 0705138d77d0ca96611b215f32b672e57914f780 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 10:49:48 +0000 Subject: [PATCH 5/7] Add --mode auto (new default): pick inspect vs both from launch env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the choice between 'inspect' (web UI + interactive VLM setup, no MCP) and 'both' (MCP + web, no interactive setup) had to be made by the user — and each had a silent failure mode in the wrong environment: • 'both' from a TTY: the MCP server blocks reading framing bytes from a keyboard nobody types on, and the VLM picker is suppressed because stdin "belongs" to MCP. • 'inspect' from a pipe (Claude Desktop launches us): an MCP client waiting on stdout never sees a framed message. The new 'auto' mode (now the default) resolves this at startup by checking sys.stdin.isatty(): • TTY → runs as 'inspect'. Interactive VLM setup + Ollama auto-pull prompts work; we don't start MCP because it would just block. This is the graceful fallback for the missing capability (MCP framing). • Non-TTY → runs as 'both'. MCP framing has a real client on stdio; interactive setup is skipped (config-driven model picks still apply). 
The web inspector also comes up on :5001. Either branch logs a one-line stderr notice naming the resolved mode and the reason. The explicit modes (mcp/inspect/both) still work for users who want to force one regardless of the environment. https://claude.ai/code/session_01VhYzhCbZ5qvmBThCH8cxLD --- README.md | 7 ++++--- main.py | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5493052..b407e98 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,10 @@ OSScreenObserver exposes a full REST API on port `5001` (configurable). Most `/a ### Startup modes ```bash -python main.py # Default: HTTP server only (web UI + REST API, interactive VLM setup) -python main.py --mode both # REST API + MCP stdio simultaneously -python main.py --mode mcp # MCP stdio only +python main.py # Default: auto — TTY → inspect (web UI + interactive setup); piped (Claude Desktop) → both +python main.py --mode both # Force REST API + MCP stdio simultaneously +python main.py --mode inspect # Force web UI only +python main.py --mode mcp # Force MCP stdio only python main.py --mock # Mock mode with synthetic data (no OS access) python main.py --mock --scenario scenarios_examples/login.yaml # Scenario-driven mock ``` diff --git a/main.py b/main.py index 652459d..8275423 100644 --- a/main.py +++ b/main.py @@ -3,14 +3,16 @@ Usage ───── - # Web inspector only (port 5001) — the default; stdin is free so VLM - # model setup and Ollama auto-pull can prompt interactively on first run. + # Auto — the default. Detects the launch environment: + # • TTY (interactive terminal) → runs as `inspect` so the VLM picker + # and Ollama auto-pull can prompt, and so the MCP server isn't left + # blocked reading from your keyboard. + # • Non-TTY (Claude Desktop pipes us, CI, headless service) → runs as + # `both`, exposing MCP on stdio and the web inspector at :5001. 
python main.py - # Both MCP server (stdio) + web inspector simultaneously + # Force a specific mode (overrides the auto detection) python main.py --mode both - - # Web inspector only (useful for manual exploration) python main.py --mode inspect # MCP stdio only (useful when launched by Claude Desktop) @@ -175,16 +177,19 @@ def build_parser() -> argparse.ArgumentParser: formatter_class = argparse.RawDescriptionHelpFormatter, epilog = """ examples: - python main.py # web UI only (default) + python main.py # auto-detect (TTY → inspect, pipe → both) python main.py --mode both # MCP stdio + web UI + python main.py --mode inspect # web UI only python main.py --mode mcp # MCP stdio only python main.py --mock # mock data (no OS access needed) python main.py --mock --port 8080 """, ) - p.add_argument("--mode", choices=["mcp", "inspect", "both"], default="inspect", - help="Run mode (default: inspect — interactive VLM setup " - "runs only in this mode)") + p.add_argument("--mode", + choices=["auto", "mcp", "inspect", "both"], default="auto", + help="Run mode (default: auto — picks 'inspect' when stdin is a " + "TTY so interactive VLM setup can run, else 'both' so " + "MCP framing works on stdio)") p.add_argument("--config", default="config.json", help="Path to JSON config file (default: config.json)") p.add_argument("--mock", action="store_true", @@ -227,6 +232,34 @@ def main() -> None: setup_logging(config) logger = logging.getLogger("main") + # ── Auto-mode resolution ──────────────────────────────────────────────── + # The legacy three modes (mcp/inspect/both) each have a failure mode if + # run in the wrong environment: + # + # • `both` from a TTY → MCP server blocks reading framing bytes + # from a keyboard nobody types on, and the + # VLM picker is suppressed because stdin + # "belongs" to MCP. + # • `inspect` from a pipe → an MCP client (Claude Desktop) waiting + # on stdout never sees a framed message. 
+ # + # `auto` notifies the user, picks the right concrete mode for the + # actual launch environment, and stays out of the way for users who + # passed --mode explicitly. + if args.mode == "auto": + if sys.stdin.isatty(): + args.mode = "inspect" + print("[screen_observer] auto-mode → 'inspect' " + "(TTY detected; MCP needs a piped framing channel, " + "so it would block here — falling back gracefully)", + file=sys.stderr) + else: + args.mode = "both" + print("[screen_observer] auto-mode → 'both' " + "(no TTY; assuming an MCP client is on stdio; " + "interactive VLM setup will be skipped)", + file=sys.stderr) + # ── VLM model setup ────────────────────────────────────────────────────── # In `inspect` mode stdin is free, so we can prompt the operator to pick # a model and persist the choice. In `mcp`/`both` mode stdin is owned by From 44d0e9ba77839a2cf1685f551c6e7f4e2c48c0e7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 10:52:29 +0000 Subject: [PATCH 6/7] Bootstrap config.json and auto-fix broken tesseract path in start scripts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds setup_config.py — a small one-shot helper that each launcher now runs after the tesseract install step: 1. If config.json does not exist, copies it from config.json.example so the user has a real file (gitignored) to edit, rather than relying on main.py's lazier bootstrap which fires only when load_config reads the missing file. 2. Checks ocr.tesseract_cmd. The bundled example ships the Windows path 'c:\Program Files\Tesseract-OCR\tesseract.exe', which on Linux/macOS is definitely wrong. If the configured path doesn't exist (or is unset AND tesseract is not on PATH), searches PATH plus common install locations ('/usr/bin', '/usr/local/bin', '/opt/homebrew/bin', the Windows Program Files variants) and offers to update the config to point at the discovered binary. 
Y-by-default prompt, atomic write via tempfile+rename, and no-ops silently when stdin is not a TTY or when tesseract is truly missing (the launcher's install step has already warned about that case). start.sh, start-mac.sh, and start.bat all call 'python setup_config.py' right before launching main.py. Adds tests/test_setup_config.py covering all six branches (configured-and-exists, unset-but-on-PATH, broken, user-declined, totally-missing, missing-ocr-section) plus a check that the atomic save leaves no .tmp leftovers. On the related question of whether --mode should default to 'both' instead: no. 'auto' (the current default) already picks 'both' whenever stdin is not a TTY — exactly the environment in which 'both' makes sense. Forcing 'both' as the default would re-introduce the silent TTY-blocked-on-stdin failure that 'auto' was added to avoid. https://claude.ai/code/session_01VhYzhCbZ5qvmBThCH8cxLD --- setup_config.py | 170 +++++++++++++++++++++++++++++++++++++ start-mac.sh | 6 +- start.bat | 6 +- start.sh | 6 +- tests/test_setup_config.py | 132 ++++++++++++++++++++++++++++ 5 files changed, 317 insertions(+), 3 deletions(-) create mode 100644 setup_config.py create mode 100644 tests/test_setup_config.py diff --git a/setup_config.py b/setup_config.py new file mode 100644 index 0000000..d96c4cd --- /dev/null +++ b/setup_config.py @@ -0,0 +1,170 @@ +""" +setup_config.py — Interactive one-shot config fixups for the start scripts. + +Called by start.sh / start-mac.sh / start.bat after the tesseract install +step. Does exactly two things, no more: + + 1. If config.json does not exist, copy it from config.json.example so + the user has a real file to edit (gitignored — config.json.example + is the source of truth). + + 2. Check ocr.tesseract_cmd. 
If it is set but points to a path that does + not exist (the bundled example ships the Windows path, which is wrong + on Linux/macOS), search PATH and a few common install locations, + then offer to write the discovered path back to config.json. + +Everything is opt-in via a Y/n prompt. If the prompts are skipped or the +script is run non-interactively (no TTY), no changes are made. +""" + +from __future__ import annotations + +import json +import os +import shutil +import sys +import tempfile +from typing import Optional + + +_COMMON_TESSERACT_PATHS = [ + # Linux + "/usr/bin/tesseract", + "/usr/local/bin/tesseract", + # macOS — Homebrew on Apple Silicon and Intel + "/opt/homebrew/bin/tesseract", + # Windows (the script will normalise PATH separators) + r"C:\Program Files\Tesseract-OCR\tesseract.exe", + r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe", +] + + +def _confirm(prompt: str) -> bool: + """Y-by-default Y/n prompt. False on EOF or non-TTY.""" + if not sys.stdin.isatty(): + return False + try: + reply = input(f"{prompt} [Y/n] ").strip().lower() + except (EOFError, KeyboardInterrupt): + return False + return reply in ("", "y", "yes") + + +def _find_tesseract_on_path() -> Optional[str]: + """which/where tesseract — return absolute path or None.""" + path = shutil.which("tesseract") + if path: + return path + # On Windows shutil.which may need the .exe suffix explicitly. + return shutil.which("tesseract.exe") + + +def _find_tesseract() -> Optional[str]: + """Return the first existing tesseract path: PATH first, then well-known.""" + p = _find_tesseract_on_path() + if p and os.path.exists(p): + return p + for candidate in _COMMON_TESSERACT_PATHS: + if os.path.exists(candidate): + return candidate + return None + + +def _atomic_write_json(path: str, data: dict) -> None: + """Write *data* to *path* via temp-file + rename so a Ctrl-C mid-flush + cannot truncate the user's config.""" + dir_name = os.path.dirname(os.path.abspath(path)) or "." 
+ fd, tmp = tempfile.mkstemp(prefix=".config.", suffix=".json.tmp", + dir=dir_name) + try: + with os.fdopen(fd, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + f.write("\n") + os.replace(tmp, path) + except Exception: + try: + os.unlink(tmp) + except OSError: + pass + raise + + +def bootstrap_config(config_path: str = "config.json", + example_path: str = "config.json.example") -> None: + """Copy the example into place when config.json is missing.""" + if os.path.exists(config_path): + return + if not os.path.exists(example_path): + print(f"[setup_config] {example_path!r} not found — cannot bootstrap " + f"{config_path!r}.", file=sys.stderr) + return + shutil.copyfile(example_path, config_path) + print(f"[setup_config] Seeded {config_path!r} from {example_path!r}.", + file=sys.stderr) + + +def fix_tesseract_path(config_path: str = "config.json") -> None: + """If ocr.tesseract_cmd in config_path is unset or broken, offer to set + it to a tesseract binary discovered on the system.""" + if not os.path.exists(config_path): + return + try: + with open(config_path, encoding="utf-8") as f: + cfg = json.load(f) + except Exception as e: + print(f"[setup_config] Could not read {config_path!r}: {e}", + file=sys.stderr) + return + + ocr_section = cfg.get("ocr") or {} + configured = ocr_section.get("tesseract_cmd") + + # Case A — the configured path is a real file. Nothing to do. + if configured and os.path.exists(configured): + return + + # Case B — no path set AND tesseract is on PATH. Also fine; pytesseract + # will discover it. Don't pester the user. + if not configured and _find_tesseract_on_path(): + return + + # Case C — broken configured path, OR tesseract is not on PATH but we + # can find it at a well-known location. Either way, offer a fix. + discovered = _find_tesseract() + if discovered is None: + # Truly missing — the install step in the launcher already warned. 
+ return + + if configured: + print(f"[setup_config] ocr.tesseract_cmd in {config_path!r} points to " + f"{configured!r}, which does not exist on this system.", + file=sys.stderr) + else: + print(f"[setup_config] tesseract is not on PATH but was found at " + f"{discovered!r}.", file=sys.stderr) + + if not _confirm( + f" Update ocr.tesseract_cmd to {discovered!r}?" + ): + print(" (skipping — OCR may not work until you fix this manually)", + file=sys.stderr) + return + + cfg.setdefault("ocr", {})["tesseract_cmd"] = discovered + try: + _atomic_write_json(config_path, cfg) + print(f"[setup_config] Updated {config_path!r}: " + f"ocr.tesseract_cmd = {discovered!r}", file=sys.stderr) + except Exception as e: + print(f"[setup_config] Could not write {config_path!r}: {e}", + file=sys.stderr) + + +def main() -> int: + bootstrap_config() + fix_tesseract_path() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/start-mac.sh b/start-mac.sh index ac683b3..4bf44a6 100755 --- a/start-mac.sh +++ b/start-mac.sh @@ -123,9 +123,13 @@ cat <<'EOF' EOF +# ─── Bootstrap config.json + fix tesseract path ────────────────────────────── + +python3 setup_config.py || true + # ─── Launch ────────────────────────────────────────────────────────────────── -echo " Starting OSScreenObserver (default mode: inspect)…" +echo " Starting OSScreenObserver (auto mode: TTY → inspect, pipe → both)…" echo " Web UI → http://127.0.0.1:5001" echo "" exec python3 main.py "$@" diff --git a/start.bat b/start.bat index fbc2d33..5a0507e 100644 --- a/start.bat +++ b/start.bat @@ -112,10 +112,14 @@ if !ANSWER!==Y ( %PY_CMD% -m pip install -r requirements.txt ) +REM ─── Bootstrap config.json + fix tesseract path ──────────────────────────── + +%PY_CMD% setup_config.py + REM ─── Launch ───────────────────────────────────────────────────────────────── echo. -echo Starting OSScreenObserver (default mode: inspect)... +echo Starting OSScreenObserver (auto mode: TTY -^> inspect, pipe -^> both)... 
echo Web UI -^> http://127.0.0.1:5001 echo. %PY_CMD% main.py %* diff --git a/start.sh b/start.sh index c1f023e..dad92f8 100755 --- a/start.sh +++ b/start.sh @@ -138,10 +138,14 @@ if confirm "Install/upgrade Python dependencies from requirements.txt?"; then python3 -m pip install -r requirements.txt fi +# ─── Bootstrap config.json + fix tesseract path ────────────────────────────── + +python3 setup_config.py || true + # ─── Launch ────────────────────────────────────────────────────────────────── echo "" -echo " Starting OSScreenObserver (default mode: inspect)…" +echo " Starting OSScreenObserver (auto mode: TTY → inspect, pipe → both)…" echo " Web UI → http://127.0.0.1:5001" echo "" exec python3 main.py "$@" diff --git a/tests/test_setup_config.py b/tests/test_setup_config.py new file mode 100644 index 0000000..112f999 --- /dev/null +++ b/tests/test_setup_config.py @@ -0,0 +1,132 @@ +"""Tests for setup_config.py — config bootstrap and tesseract-path fixup.""" +from __future__ import annotations + +import json +import os +from unittest.mock import patch + +import setup_config + + +def _read(path) -> dict: + with open(path, encoding="utf-8") as f: + return json.load(f) + + +# ─── bootstrap_config ──────────────────────────────────────────────────────── + +def test_bootstrap_copies_example_when_config_missing(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + (tmp_path / "config.json.example").write_text('{"ocr": {}}', encoding="utf-8") + assert not (tmp_path / "config.json").exists() + setup_config.bootstrap_config() + assert (tmp_path / "config.json").exists() + assert _read(tmp_path / "config.json") == {"ocr": {}} + + +def test_bootstrap_leaves_existing_config_alone(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + (tmp_path / "config.json").write_text('{"existing": true}', encoding="utf-8") + (tmp_path / "config.json.example").write_text('{"ocr": {}}', encoding="utf-8") + setup_config.bootstrap_config() + assert _read(tmp_path / "config.json") == 
{"existing": True} + + +def test_bootstrap_silent_when_example_missing(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + # Neither file exists. + setup_config.bootstrap_config() + assert not (tmp_path / "config.json").exists() + + +# ─── fix_tesseract_path ────────────────────────────────────────────────────── + +def _cfg(tmp_path, ocr=None): + cfg = {"ocr": dict(ocr) if ocr else {}} + p = tmp_path / "config.json" + p.write_text(json.dumps(cfg), encoding="utf-8") + return str(p) + + +def test_fix_path_no_op_when_configured_path_exists(tmp_path): + real = tmp_path / "fake_tesseract" + real.touch() + path = _cfg(tmp_path, ocr={"tesseract_cmd": str(real)}) + with patch("setup_config._find_tesseract") as mock_find, \ + patch("setup_config._confirm") as mock_confirm: + setup_config.fix_tesseract_path(path) + mock_find.assert_not_called() + mock_confirm.assert_not_called() + assert _read(path)["ocr"]["tesseract_cmd"] == str(real) + + +def test_fix_path_no_op_when_unset_but_on_path(tmp_path): + path = _cfg(tmp_path, ocr={"tesseract_cmd": None}) + with patch("setup_config._find_tesseract_on_path", + return_value="/usr/bin/tesseract"), \ + patch("setup_config._confirm") as mock_confirm: + setup_config.fix_tesseract_path(path) + mock_confirm.assert_not_called() + # Config unchanged. 
+ assert _read(path)["ocr"]["tesseract_cmd"] is None + + +def test_fix_path_updates_when_configured_path_broken(tmp_path): + discovered = tmp_path / "real_tesseract" + discovered.touch() + path = _cfg(tmp_path, ocr={ + "tesseract_cmd": "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" + }) + with patch("setup_config._find_tesseract", return_value=str(discovered)), \ + patch("setup_config._confirm", return_value=True): + setup_config.fix_tesseract_path(path) + assert _read(path)["ocr"]["tesseract_cmd"] == str(discovered) + + +def test_fix_path_respects_user_decline(tmp_path): + discovered = tmp_path / "real_tesseract" + discovered.touch() + broken = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe" + path = _cfg(tmp_path, ocr={"tesseract_cmd": broken}) + with patch("setup_config._find_tesseract", return_value=str(discovered)), \ + patch("setup_config._confirm", return_value=False): + setup_config.fix_tesseract_path(path) + # User said no — leave the broken path alone. + assert _read(path)["ocr"]["tesseract_cmd"] == broken + + +def test_fix_path_no_op_when_tesseract_missing(tmp_path): + path = _cfg(tmp_path, ocr={"tesseract_cmd": "/nonexistent/tesseract"}) + with patch("setup_config._find_tesseract", return_value=None), \ + patch("setup_config._confirm") as mock_confirm: + setup_config.fix_tesseract_path(path) + mock_confirm.assert_not_called() + # Config unchanged — the launcher's install step already warned. 
+ assert _read(path)["ocr"]["tesseract_cmd"] == "/nonexistent/tesseract" + + +def test_fix_path_creates_ocr_section_if_missing(tmp_path): + discovered = tmp_path / "real_tesseract" + discovered.touch() + p = tmp_path / "config.json" + p.write_text('{"vlm": {"enabled": true}}', encoding="utf-8") + with patch("setup_config._find_tesseract_on_path", return_value=None), \ + patch("setup_config._find_tesseract", return_value=str(discovered)), \ + patch("setup_config._confirm", return_value=True): + setup_config.fix_tesseract_path(str(p)) + cfg = _read(p) + assert cfg["ocr"]["tesseract_cmd"] == str(discovered) + # Other sections preserved. + assert cfg["vlm"] == {"enabled": True} + + +def test_fix_path_atomic_no_stray_tempfile(tmp_path): + discovered = tmp_path / "real_tesseract" + discovered.touch() + path = _cfg(tmp_path, ocr={"tesseract_cmd": "/broken/path"}) + with patch("setup_config._find_tesseract", return_value=str(discovered)), \ + patch("setup_config._confirm", return_value=True): + setup_config.fix_tesseract_path(path) + # No leftover temp file from the atomic save. + leftovers = [n for n in os.listdir(tmp_path) if n.endswith(".tmp")] + assert leftovers == [] From 7f3afe5ce8ef184a9b78b1ab27abee07ae562cad Mon Sep 17 00:00:00 2001 From: William Mongan Date: Sun, 17 May 2026 15:59:01 -0400 Subject: [PATCH 7/7] Replace deprecated mss.mss() with mss.MSS() mss.mss is deprecated and will be removed in a future release. All six call sites in observer.py updated. 
Co-Authored-By: Claude Sonnet 4.6 --- observer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/observer.py b/observer.py index 97f88f9..6ea8b3d 100644 --- a/observer.py +++ b/observer.py @@ -755,7 +755,7 @@ def get_screenshot(self, hwnd=None) -> Optional[bytes]: from PIL import Image import win32gui - with mss.mss() as sct: + with mss.MSS() as sct: if hwnd: rect = win32gui.GetWindowRect(hwnd) region = {"left": rect[0], "top": rect[1], @@ -839,7 +839,7 @@ def get_screenshot(self, hwnd=None) -> Optional[bytes]: try: import mss from PIL import Image - with mss.mss() as sct: + with mss.MSS() as sct: raw = sct.grab(sct.monitors[1]) img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX") buf = io.BytesIO() @@ -968,7 +968,7 @@ def get_screenshot(self, hwnd=None) -> Optional[bytes]: try: import mss from PIL import Image - with mss.mss() as sct: + with mss.MSS() as sct: raw = sct.grab(sct.monitors[1]) img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX") buf = io.BytesIO() @@ -1184,7 +1184,7 @@ def get_full_display_screenshot(self) -> Optional[bytes]: try: import mss from PIL import Image - with mss.mss() as sct: + with mss.MSS() as sct: raw = sct.grab(sct.monitors[0]) # 0 = union of all monitors img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX") buf = io.BytesIO() @@ -1250,7 +1250,7 @@ def get_monitors(self) -> List[Dict[str, Any]]: """Return per-monitor metadata via mss.""" try: import mss - with mss.mss() as sct: + with mss.MSS() as sct: mons = sct.monitors # [0] is union; [1..] are individual out: List[Dict[str, Any]] = [] for i, m in enumerate(mons[1:]): @@ -1357,7 +1357,7 @@ def get_screen_bounds(self) -> Bounds: """Return the bounding rect of the combined virtual screen (all monitors).""" try: import mss - with mss.mss() as sct: + with mss.MSS() as sct: m = sct.monitors[0] # index 0 = union of all monitors return Bounds(m["left"], m["top"], m["width"], m["height"]) except Exception: