Skip to content

Commit d207965

Browse files
committed
feat: Two-stage RT-DETR → Gemma 4 Vision pipeline
- New ai-worker-detr.js: transient Web Worker using @huggingface/transformers@3.5.2 pipeline('object-detection', 'onnx-community/rt-detr-r18-enc3-coco'), WebGPU-first with WASM fallback; CPPE-5 fallback model; threshold: 0.3
- ai-models.js: added rt-detr model entry (hidden, doc-model, ~85 MB)
- ai-docgen.js: @Detect: yes / @Detect: no field parsed on Vision cards; Detect toggle button in Vision card header; modality hint footer updates inline; .ai-vision-detections panel included in every Vision card (hidden by default)
- ai-docgen-generate.js: runDetrDetection() + renderDetrDetections(); Stage 1: RT-DETR detections with bbox canvas overlay + confidence pills; Stage 2: detection list injected into Gemma 4 prompt as structured context, so Gemma 4 receives visual tokens + object anchors for grounded descriptions
- css/ai-docgen.css: detection panel, canvas, pills (color-mix theming), progress bar, model badge, active detect toggle, fadeInDown animation
1 parent 0f136a7 commit d207965

6 files changed

Lines changed: 592 additions & 7 deletions

File tree

ai-worker-detr.js

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
// ============================================
// ai-worker-detr.js — RT-DETR Object Detection Worker
// Model: onnx-community/rt-detr-r18-enc3-coco (COCO-80; fallback: onnx-community/rtdetr-r18-cppe5)
// Pipeline: object-detection
// Role: First-pass detector — feeds bounding boxes + labels to Gemma 4
// ============================================

const TRANSFORMERS_URL = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.2";

// Primary: small RT-DETR r18 trained on COCO (80 classes, ~83 MB)
const MODEL_PRIMARY = "onnx-community/rt-detr-r18-enc3-coco";
// Fallback: CPPE-5 RT-DETR variant, used only when the primary fails to load
const MODEL_FALLBACK = "onnx-community/rtdetr-r18-cppe5";
// Tracks which model actually loaded; echoed in progress/loaded messages
let MODEL_ID = MODEL_PRIMARY;

// Populated by loadModel(): the transformers.js pipeline factory and the
// instantiated object-detection pipeline instance.
let pipeline = null;
let detector = null;

self.postMessage({ type: "status", message: "[DETR] Initialising RT-DETR worker…" });
19+
20+
// ─────────────────────────────────────────────
// Progress callback
// ─────────────────────────────────────────────
// Builds a transformers.js progress_callback that relays model-download
// events to the main thread: "progress" events become structured progress
// messages (tagged with the current MODEL_ID), while "initiate"/"done"
// events become human-readable status messages. Other statuses are ignored.
function makeProgressCb() {
  return (p) => {
    const label = p.file || "RT-DETR";
    switch (p.status) {
      case "progress":
        self.postMessage({
          type: "progress",
          file: label,
          loaded: p.loaded || 0,
          total: p.total || 0,
          progress: p.progress || 0,
          source: MODEL_ID,
        });
        break;
      case "initiate":
        self.postMessage({ type: "status", message: `Loading ${label}…`, loadingPhase: "initiate" });
        break;
      case "done":
        self.postMessage({ type: "status", message: `Loaded ${label} ✓`, loadingPhase: "done" });
        break;
      default:
        // Statuses other than progress/initiate/done are intentionally dropped.
        break;
    }
  };
}
41+
42+
// ─────────────────────────────────────────────
// Load model
// ─────────────────────────────────────────────
/**
 * Loads the RT-DETR object-detection pipeline.
 *
 * Device selection is WebGPU-first with WASM fallback; model selection is
 * MODEL_PRIMARY-first with MODEL_FALLBACK as backup (MODEL_ID is updated to
 * whichever model actually loaded). Posts "status"/"progress" messages while
 * loading, then a final { type: "loaded", device, modelId } on success or
 * { type: "error" } on failure.
 */
async function loadModel() {
  try {
    self.postMessage({ type: "status", message: "Initialising RT-DETR (object detection)…" });

    const transformers = await import(TRANSFORMERS_URL);
    pipeline = transformers.pipeline;

    // WebGPU if available. NOTE: requestAdapter() can itself reject in some
    // environments; that must not abort the whole load — fall back to WASM
    // instead of letting the outer catch report a fatal model error.
    let device = "wasm";
    if (typeof navigator !== "undefined" && navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) device = "webgpu";
      } catch (gpuErr) {
        console.warn("[DETR] WebGPU adapter probe failed:", gpuErr?.message, "→ using WASM");
      }
    }
    self.postMessage({ type: "status", message: `RT-DETR using ${device.toUpperCase()}…` });

    // Instantiate the object-detection pipeline for the given model id,
    // streaming download progress back to the main thread.
    async function tryLoad(modelId) {
      detector = await pipeline("object-detection", modelId, {
        device,
        progress_callback: makeProgressCb(),
      });
    }

    try {
      await tryLoad(MODEL_PRIMARY);
    } catch (e) {
      console.warn("[DETR] Primary model failed:", e?.message, "→ falling back to", MODEL_FALLBACK);
      self.postMessage({ type: "status", message: "Falling back to CPPE-5 RT-DETR model…" });
      MODEL_ID = MODEL_FALLBACK;
      await tryLoad(MODEL_FALLBACK);
    }

    self.postMessage({ type: "loaded", device, modelId: MODEL_ID });
  } catch (err) {
    self.postMessage({ type: "error", message: `RT-DETR failed to load: ${err.message}` });
  }
}
81+
82+
// ─────────────────────────────────────────────
// Detect — returns structured detection list
// ─────────────────────────────────────────────
// Runs the loaded RT-DETR pipeline on one image and posts the detections
// back to the main thread, sorted by confidence (highest first). Posts an
// "error" message instead if the model is not loaded or inference throws.
// `messageId` is echoed on every reply so the caller can correlate it.
async function detect({ imageData, threshold = 0.35, messageId }) {
  // Guard: loadModel() must have completed before any detection request.
  if (!detector) {
    self.postMessage({ type: "error", message: "RT-DETR not loaded yet.", messageId });
    return;
  }

  try {
    self.postMessage({ type: "status", message: "Running RT-DETR object detection…", messageId });

    // imageData is a base64 data-URL string (data:image/jpeg;base64,...)
    // Raw output: [{ score, label, box: { xmin, ymin, xmax, ymax } }, ...]
    const raw = await detector(imageData, { threshold });

    // Non-mutating copy, ordered by descending confidence.
    const byConfidence = Array.from(raw).sort((first, second) => second.score - first.score);

    self.postMessage({ type: "detections", detections: byConfidence, messageId });
  } catch (err) {
    self.postMessage({ type: "error", message: `RT-DETR detection failed: ${err.message}`, messageId });
  }
}
110+
111+
// ─────────────────────────────────────────────
// Message handler
// ─────────────────────────────────────────────
// Routes commands from the main thread:
//   load   → download + initialise the RT-DETR pipeline
//   detect → run object detection on the supplied image
//   ping   → liveness check (replies with "pong")
self.addEventListener("message", async (event) => {
  const data = event.data;

  if (data.type === "load") {
    await loadModel();
  } else if (data.type === "detect") {
    await detect({
      imageData: data.imageData,
      threshold: data.threshold || 0.35,
      messageId: data.messageId,
    });
  } else if (data.type === "ping") {
    self.postMessage({ type: "pong" });
  } else {
    console.warn("[DETR worker] Unknown message type:", data.type);
  }
});
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Two-Stage RT-DETR → Gemma 4 Vision Pipeline
2+
3+
- New `ai-worker-detr.js` — transient Web Worker for RT-DETR object detection using `@huggingface/transformers@3.5.2`
4+
- Primary model `onnx-community/rt-detr-r18-enc3-coco` (80 COCO classes, ~85 MB) with fallback to `onnx-community/rtdetr-r18-cppe5`
5+
- WebGPU-first with WASM fallback; detection threshold: 0.3
6+
- `rt-detr` model entry added to `ai-models.js` with `hidden: true` (not shown in main selector — used internally by Vision cards)
7+
- `@detect: yes` / `@detect: no` field parsed in Vision card renderer (`ai-docgen.js`)
8+
- 🔍 **Detect toggle button** added to Vision card header — toggles `active` CSS class, updates `@detect:` field in markdown source, updates card's `data-detect` attribute and modality hint footer inline
9+
- `.ai-vision-detections` panel div now rendered into every Vision card (hidden by default, appears when RT-DETR runs)
10+
- Modality hint footer updates to show: `· 🔍 RT-DETR detect → Gemma 4 describe` when detect is active
11+
- `runDetrDetection(imageDataUrl, blockIndex)` in `ai-docgen-generate.js`: spins up a transient DETR worker, streams load progress into the detection panel, sends the image for detection, awaits results, calls `renderDetrDetections()`, terminates worker; 120 s timeout safety
12+
- `renderDetrDetections(imageDataUrl, detections, blockIndex)`: draws image onto Canvas at max 520px width with colour-coded bounding boxes (label + % confidence drawn on box), renders per-class confidence pills below using `DETR_COLORS` palette (10-colour rotating)
13+
- Vision block handler in `ai-docgen-generate.js` restructured as explicit two stages:
14+
- **Stage 1**: if `data-detect="true"` + image attachment present → `runDetrDetection()` → structured detection list
15+
- **Stage 2**: inject `"Detected objects (from RT-DETR): person (97%), laptop (94%), …\n\n[user prompt]"` into Gemma 4 → Gemma 4 receives both visual tokens + detection anchors for grounded, richer descriptions
16+
- CSS additions in `ai-docgen.css`:
17+
- `.ai-vision-detections` panel (dark bg, cyan border, `fadeInDown` animation)
18+
- `.ai-detr-canvas` (full-width, rounded top corners)
19+
- `.ai-detr-header` + `.ai-detr-model-badge` (uppercase pill badge)
20+
- `.ai-detr-pills` + `.ai-detr-pill` — per-class confidence pills with `color-mix()` theming via `--detr-color` CSS variable
21+
- `.ai-detr-pill-dot` — colour swatch dot
22+
- `.ai-detr-progress` / `.ai-detr-progress-bar` — cyan gradient download progress bar
23+
- `.ai-detr-status` / `.ai-detr-empty` — italic status + empty state text
24+
- `.ai-vision-detect-toggle.active` — glowing cyan highlight when detect is enabled
25+
- Light theme variants for all new elements
26+
27+
---
28+
29+
## Summary
30+
31+
Implements a two-stage object detection + scene description pipeline directly inside the `{{@Vision:}}` DocGen card. When the user clicks 🔍 Detect, RT-DETR (80-class COCO, ~85 MB, runs locally via WebGPU/WASM) runs as a first pass on the uploaded image — drawing colour-coded bounding boxes onto a Canvas overlay and rendering confidence pill badges. The detected objects are then automatically injected as structured context into the Gemma 4 prompt, giving the model grounded object anchors for richer, more accurate scene descriptions. Inspired by Roboflow's RF-DETR + Gemma demo workflow.
32+
33+
---
34+
35+
## Files Changed (5 total)
36+
37+
| File | Type | Description |
38+
|------|------|-------------|
39+
| `ai-worker-detr.js` | NEW | RT-DETR Web Worker — object detection pipeline |
40+
| `js/ai-models.js` | MODIFY | `rt-detr` model registry entry (hidden, internal) |
41+
| `js/ai-docgen.js` | MODIFY | `@detect:` field parsing, 🔍 toggle button, detect toggle click handler |
42+
| `js/ai-docgen-generate.js` | MODIFY | `runDetrDetection()`, `renderDetrDetections()`, two-stage Vision block handler |
43+
| `css/ai-docgen.css` | MODIFY | Detection panel, bbox canvas, pills, progress bar, active detect button |

css/ai-docgen.css

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,138 @@
349349
color: rgba(43, 108, 176, 0.6);
350350
}
351351

352+
/* --- RT-DETR Detection Panel (inside Vision cards) --- */
.ai-vision-detections {
  margin: 0 12px 10px;
  border-radius: 8px;
  border: 1px solid rgba(99, 179, 237, 0.2);
  background: rgba(10, 22, 40, 0.6);
  overflow: hidden;
  animation: fadeInDown 0.25s ease;
}

@keyframes fadeInDown {
  from { opacity: 0; transform: translateY(-6px); }
  to { opacity: 1; transform: translateY(0); }
}

/* Canvas with bounding box overlays */
.ai-detr-canvas {
  display: block;
  width: 100%;
  height: auto;
  border-radius: 6px 6px 0 0;
  border-bottom: 1px solid rgba(99, 179, 237, 0.15);
}

/* Detection count + model badge header */
.ai-detr-header {
  display: flex;
  align-items: center;
  gap: 8px;
  padding: 7px 12px 4px;
  font-size: 0.78em;
  color: #bee3f8;
}

.ai-detr-model-badge {
  font-size: 0.72em;
  font-weight: 700;
  letter-spacing: 0.04em;
  text-transform: uppercase;
  background: rgba(99, 179, 237, 0.18);
  border: 1px solid rgba(99, 179, 237, 0.3);
  border-radius: 4px;
  padding: 1px 6px;
  color: #63b3ed;
}

/* Pill strip — one pill per detected class.
   Each pill is tinted via the --detr-color custom property using color-mix(). */
.ai-detr-pills {
  display: flex;
  flex-wrap: wrap;
  gap: 5px;
  padding: 4px 10px 10px;
}

.ai-detr-pill {
  display: inline-flex;
  align-items: center;
  gap: 5px;
  font-size: 0.74em;
  border: 1px solid;
  border-color: color-mix(in srgb, var(--detr-color, #63b3ed) 50%, transparent);
  background: color-mix(in srgb, var(--detr-color, #63b3ed) 12%, transparent);
  color: #e2e8f0;
  border-radius: 12px;
  padding: 2px 10px 2px 7px;
  white-space: nowrap;
  transition: all 0.15s ease;
}

.ai-detr-pill:hover {
  background: color-mix(in srgb, var(--detr-color, #63b3ed) 22%, transparent);
}

.ai-detr-pill-dot {
  width: 7px;
  height: 7px;
  border-radius: 50%;
  flex-shrink: 0;
}

/* Loading progress bar */
.ai-detr-progress {
  height: 3px;
  background: rgba(99, 179, 237, 0.1);
  margin: 4px 12px 12px;
  border-radius: 2px;
  overflow: hidden;
}

.ai-detr-progress-bar {
  height: 100%;
  background: linear-gradient(90deg, #00d4ff, #63b3ed);
  border-radius: 2px;
  transition: width 0.3s ease;
}

/* Status text during loading */
.ai-detr-status {
  padding: 10px 12px;
  font-size: 0.78em;
  color: rgba(99, 179, 237, 0.75);
  font-style: italic;
}

/* No detections fallback */
.ai-detr-empty {
  padding: 10px 12px;
  font-size: 0.78em;
  color: rgba(99, 179, 237, 0.5);
}

/* 🔍 Detect toggle button — active state */
.ai-vision-detect-toggle.active {
  background: rgba(99, 179, 237, 0.28) !important;
  border-color: rgba(99, 179, 237, 0.55) !important;
  color: #00d4ff !important;
  font-weight: 700;
}

/* Light theme adjustments */
[data-theme="light"] .ai-vision-detections {
  background: rgba(235, 248, 255, 0.8);
  border-color: rgba(43, 108, 176, 0.25);
}

[data-theme="light"] .ai-detr-header,
[data-theme="light"] .ai-detr-status,
[data-theme="light"] .ai-detr-pill {
  color: #2c5282;
}
482+
483+
352484

353485
/* --- OCR Mode Pills --- */
354486
.ai-ocr-mode-pills {

0 commit comments

Comments
 (0)