Skip to content

Commit d207965

Browse files
committed
feat: Two-stage RT-DETR → Gemma 4 Vision pipeline
- New ai-worker-detr.js: transient Web Worker using @huggingface/transformers@3.5.2 pipeline('object-detection', 'onnx-community/rt-detr-r18-enc3-coco'), WebGPU-first with WASM fallback; CPPE-5 fallback model; threshold: 0.3
- ai-models.js: added rt-detr model entry (hidden, doc-model, ~85 MB)
- ai-docgen.js: @Detect: yes / @Detect: no field parsed on Vision cards; Detect toggle button in Vision card header; modality hint footer updates inline; .ai-vision-detections panel included in every Vision card (hidden by default)
- ai-docgen-generate.js: runDetrDetection() + renderDetrDetections(); Stage 1: RT-DETR detections with bbox canvas overlay + confidence pills; Stage 2: detection list injected into Gemma 4 prompt as structured context, so Gemma 4 receives visual tokens + object anchors for grounded descriptions
- css/ai-docgen.css: detection panel, canvas, pills (color-mix theming), progress bar, model badge, active detect toggle, fadeInDown animation
1 parent 0f136a7 commit d207965

6 files changed

Lines changed: 592 additions & 7 deletions

File tree

ai-worker-detr.js

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
// ============================================
// ai-worker-detr.js — RT-DETR Object Detection Worker
// Model: onnx-community/rt-detr-r18-enc3-coco (COCO-80; fallback: onnx-community/rtdetr-r18-cppe5)
// Pipeline: object-detection
// Role: First-pass detector — feeds bounding boxes + labels to Gemma 4
// ============================================

const TRANSFORMERS_URL = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.2";

// Primary: small RT-DETR r18 trained on COCO (80 classes, ~83 MB)
const MODEL_PRIMARY = "onnx-community/rt-detr-r18-enc3-coco";
// Fallback: CPPE-5 RT-DETR variant, used only when the primary fails to load
const MODEL_FALLBACK = "onnx-community/rtdetr-r18-cppe5";
// Tracks which model actually loaded; echoed in progress/loaded messages
let MODEL_ID = MODEL_PRIMARY;

// Populated by loadModel(): the transformers.js pipeline factory and the
// instantiated object-detection pipeline instance.
let pipeline = null;
let detector = null;

self.postMessage({ type: "status", message: "[DETR] Initialising RT-DETR worker…" });
19+
20+
// ─────────────────────────────────────────────
// Progress callback
// ─────────────────────────────────────────────
// Builds a transformers.js progress_callback that relays model-download
// events to the main thread: "progress" events become structured progress
// messages (tagged with the current MODEL_ID), while "initiate"/"done"
// events become human-readable status messages. Other statuses are ignored.
function makeProgressCb() {
  return (p) => {
    const label = p.file || "RT-DETR";
    switch (p.status) {
      case "progress":
        self.postMessage({
          type: "progress",
          file: label,
          loaded: p.loaded || 0,
          total: p.total || 0,
          progress: p.progress || 0,
          source: MODEL_ID,
        });
        break;
      case "initiate":
        self.postMessage({ type: "status", message: `Loading ${label}…`, loadingPhase: "initiate" });
        break;
      case "done":
        self.postMessage({ type: "status", message: `Loaded ${label} ✓`, loadingPhase: "done" });
        break;
      default:
        // Statuses other than progress/initiate/done are intentionally dropped.
        break;
    }
  };
}
41+
42+
// ─────────────────────────────────────────────
// Load model
// ─────────────────────────────────────────────
/**
 * Loads the RT-DETR object-detection pipeline.
 *
 * Device selection is WebGPU-first with WASM fallback; model selection is
 * MODEL_PRIMARY-first with MODEL_FALLBACK as backup (MODEL_ID is updated to
 * whichever model actually loaded). Posts "status"/"progress" messages while
 * loading, then a final { type: "loaded", device, modelId } on success or
 * { type: "error" } on failure.
 */
async function loadModel() {
  try {
    self.postMessage({ type: "status", message: "Initialising RT-DETR (object detection)…" });

    const transformers = await import(TRANSFORMERS_URL);
    pipeline = transformers.pipeline;

    // WebGPU if available. NOTE: requestAdapter() can itself reject in some
    // environments; that must not abort the whole load — fall back to WASM
    // instead of letting the outer catch report a fatal model error.
    let device = "wasm";
    if (typeof navigator !== "undefined" && navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) device = "webgpu";
      } catch (gpuErr) {
        console.warn("[DETR] WebGPU adapter probe failed:", gpuErr?.message, "→ using WASM");
      }
    }
    self.postMessage({ type: "status", message: `RT-DETR using ${device.toUpperCase()}…` });

    // Instantiate the object-detection pipeline for the given model id,
    // streaming download progress back to the main thread.
    async function tryLoad(modelId) {
      detector = await pipeline("object-detection", modelId, {
        device,
        progress_callback: makeProgressCb(),
      });
    }

    try {
      await tryLoad(MODEL_PRIMARY);
    } catch (e) {
      console.warn("[DETR] Primary model failed:", e?.message, "→ falling back to", MODEL_FALLBACK);
      self.postMessage({ type: "status", message: "Falling back to CPPE-5 RT-DETR model…" });
      MODEL_ID = MODEL_FALLBACK;
      await tryLoad(MODEL_FALLBACK);
    }

    self.postMessage({ type: "loaded", device, modelId: MODEL_ID });
  } catch (err) {
    self.postMessage({ type: "error", message: `RT-DETR failed to load: ${err.message}` });
  }
}
81+
82+
// ─────────────────────────────────────────────
// Detect — returns structured detection list
// ─────────────────────────────────────────────
// Runs the loaded RT-DETR pipeline on one image and posts the detections
// back to the main thread, sorted by confidence (highest first). Posts an
// "error" message instead if the model is not loaded or inference throws.
// `messageId` is echoed on every reply so the caller can correlate it.
async function detect({ imageData, threshold = 0.35, messageId }) {
  // Guard: loadModel() must have completed before any detection request.
  if (!detector) {
    self.postMessage({ type: "error", message: "RT-DETR not loaded yet.", messageId });
    return;
  }

  try {
    self.postMessage({ type: "status", message: "Running RT-DETR object detection…", messageId });

    // imageData is a base64 data-URL string (data:image/jpeg;base64,...)
    // Raw output: [{ score, label, box: { xmin, ymin, xmax, ymax } }, ...]
    const raw = await detector(imageData, { threshold });

    // Non-mutating copy, ordered by descending confidence.
    const byConfidence = Array.from(raw).sort((first, second) => second.score - first.score);

    self.postMessage({ type: "detections", detections: byConfidence, messageId });
  } catch (err) {
    self.postMessage({ type: "error", message: `RT-DETR detection failed: ${err.message}`, messageId });
  }
}
110+
111+
// ─────────────────────────────────────────────
// Message handler
// ─────────────────────────────────────────────
// Routes commands from the main thread:
//   load   → download + initialise the RT-DETR pipeline
//   detect → run object detection on the supplied image
//   ping   → liveness check (replies with "pong")
self.addEventListener("message", async (event) => {
  const data = event.data;

  if (data.type === "load") {
    await loadModel();
  } else if (data.type === "detect") {
    await detect({
      imageData: data.imageData,
      threshold: data.threshold || 0.35,
      messageId: data.messageId,
    });
  } else if (data.type === "ping") {
    self.postMessage({ type: "pong" });
  } else {
    console.warn("[DETR worker] Unknown message type:", data.type);
  }
});
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Two-Stage RT-DETR → Gemma 4 Vision Pipeline
2+
3+
- New `ai-worker-detr.js` — transient Web Worker for RT-DETR object detection using `@huggingface/transformers@3.5.2`
4+
- Primary model `onnx-community/rt-detr-r18-enc3-coco` (80 COCO classes, ~85 MB) with fallback to `onnx-community/rtdetr-r18-cppe5`
5+
- WebGPU-first with WASM fallback; detection threshold: 0.3
6+
- `rt-detr` model entry added to `ai-models.js` with `hidden: true` (not shown in main selector — used internally by Vision cards)
7+
- `@detect: yes` / `@detect: no` field parsed in Vision card renderer (`ai-docgen.js`)
8+
- 🔍 **Detect toggle button** added to Vision card header — toggles `active` CSS class, updates `@detect:` field in markdown source, updates card's `data-detect` attribute and modality hint footer inline
9+
- `.ai-vision-detections` panel div now rendered into every Vision card (hidden by default, appears when RT-DETR runs)
10+
- Modality hint footer updates to show: `· 🔍 RT-DETR detect → Gemma 4 describe` when detect is active
11+
- `runDetrDetection(imageDataUrl, blockIndex)` in `ai-docgen-generate.js`: spins up a transient DETR worker, streams load progress into the detection panel, sends the image for detection, awaits results, calls `renderDetrDetections()`, terminates worker; 120 s timeout safety
12+
- `renderDetrDetections(imageDataUrl, detections, blockIndex)`: draws image onto Canvas at max 520px width with colour-coded bounding boxes (label + % confidence drawn on box), renders per-class confidence pills below using `DETR_COLORS` palette (10-colour rotating)
13+
- Vision block handler in `ai-docgen-generate.js` restructured as explicit two stages:
14+
- **Stage 1**: if `data-detect="true"` + image attachment present → `runDetrDetection()` → structured detection list
15+
- **Stage 2**: inject `"Detected objects (from RT-DETR): person (97%), laptop (94%), …\n\n[user prompt]"` into Gemma 4 → Gemma 4 receives both visual tokens + detection anchors for grounded, richer descriptions
16+
- CSS additions in `ai-docgen.css`:
17+
- `.ai-vision-detections` panel (dark bg, cyan border, `fadeInDown` animation)
18+
- `.ai-detr-canvas` (full-width, rounded top corners)
19+
- `.ai-detr-header` + `.ai-detr-model-badge` (uppercase pill badge)
20+
- `.ai-detr-pills` + `.ai-detr-pill` — per-class confidence pills with `color-mix()` theming via `--detr-color` CSS variable
21+
- `.ai-detr-pill-dot` — colour swatch dot
22+
- `.ai-detr-progress` / `.ai-detr-progress-bar` — cyan gradient download progress bar
23+
- `.ai-detr-status` / `.ai-detr-empty` — italic status + empty state text
24+
- `.ai-vision-detect-toggle.active` — glowing cyan highlight when detect is enabled
25+
- Light theme variants for all new elements
26+
27+
---
28+
29+
## Summary
30+
31+
Implements a two-stage object detection + scene description pipeline directly inside the `{{@Vision:}}` DocGen card. When the user clicks 🔍 Detect, RT-DETR (80-class COCO, ~85 MB, runs locally via WebGPU/WASM) runs as a first pass on the uploaded image — drawing colour-coded bounding boxes onto a Canvas overlay and rendering confidence pill badges. The detected objects are then automatically injected as structured context into the Gemma 4 prompt, giving the model grounded object anchors for richer, more accurate scene descriptions. Inspired by Roboflow's RF-DETR + Gemma demo workflow.
32+
33+
---
34+
35+
## Files Changed (5 total)
36+
37+
| File | Type | Description |
38+
|------|------|-------------|
39+
| `ai-worker-detr.js` | NEW | RT-DETR Web Worker — object detection pipeline |
40+
| `js/ai-models.js` | MODIFY | `rt-detr` model registry entry (hidden, internal) |
41+
| `js/ai-docgen.js` | MODIFY | `@detect:` field parsing, 🔍 toggle button, detect toggle click handler |
42+
| `js/ai-docgen-generate.js` | MODIFY | `runDetrDetection()`, `renderDetrDetections()`, two-stage Vision block handler |
43+
| `css/ai-docgen.css` | MODIFY | Detection panel, bbox canvas, pills, progress bar, active detect button |

css/ai-docgen.css

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,138 @@
349349
color: rgba(43, 108, 176, 0.6);
350350
}
351351

352+
/* --- RT-DETR Detection Panel (inside Vision cards) --- */
.ai-vision-detections {
  margin: 0 12px 10px;
  border-radius: 8px;
  border: 1px solid rgba(99, 179, 237, 0.2);
  background: rgba(10, 22, 40, 0.6);
  overflow: hidden;
  animation: fadeInDown 0.25s ease;
}

@keyframes fadeInDown {
  from { opacity: 0; transform: translateY(-6px); }
  to { opacity: 1; transform: translateY(0); }
}

/* Canvas with bounding box overlays */
.ai-detr-canvas {
  display: block;
  width: 100%;
  height: auto;
  border-radius: 6px 6px 0 0;
  border-bottom: 1px solid rgba(99, 179, 237, 0.15);
}

/* Detection count + model badge header */
.ai-detr-header {
  display: flex;
  align-items: center;
  gap: 8px;
  padding: 7px 12px 4px;
  font-size: 0.78em;
  color: #bee3f8;
}

.ai-detr-model-badge {
  font-size: 0.72em;
  font-weight: 700;
  letter-spacing: 0.04em;
  text-transform: uppercase;
  background: rgba(99, 179, 237, 0.18);
  border: 1px solid rgba(99, 179, 237, 0.3);
  border-radius: 4px;
  padding: 1px 6px;
  color: #63b3ed;
}

/* Pill strip — one pill per detected class.
   Each pill is tinted via the --detr-color custom property using color-mix(). */
.ai-detr-pills {
  display: flex;
  flex-wrap: wrap;
  gap: 5px;
  padding: 4px 10px 10px;
}

.ai-detr-pill {
  display: inline-flex;
  align-items: center;
  gap: 5px;
  font-size: 0.74em;
  border: 1px solid;
  border-color: color-mix(in srgb, var(--detr-color, #63b3ed) 50%, transparent);
  background: color-mix(in srgb, var(--detr-color, #63b3ed) 12%, transparent);
  color: #e2e8f0;
  border-radius: 12px;
  padding: 2px 10px 2px 7px;
  white-space: nowrap;
  transition: all 0.15s ease;
}

.ai-detr-pill:hover {
  background: color-mix(in srgb, var(--detr-color, #63b3ed) 22%, transparent);
}

.ai-detr-pill-dot {
  width: 7px;
  height: 7px;
  border-radius: 50%;
  flex-shrink: 0;
}

/* Loading progress bar */
.ai-detr-progress {
  height: 3px;
  background: rgba(99, 179, 237, 0.1);
  margin: 4px 12px 12px;
  border-radius: 2px;
  overflow: hidden;
}

.ai-detr-progress-bar {
  height: 100%;
  background: linear-gradient(90deg, #00d4ff, #63b3ed);
  border-radius: 2px;
  transition: width 0.3s ease;
}

/* Status text during loading */
.ai-detr-status {
  padding: 10px 12px;
  font-size: 0.78em;
  color: rgba(99, 179, 237, 0.75);
  font-style: italic;
}

/* No detections fallback */
.ai-detr-empty {
  padding: 10px 12px;
  font-size: 0.78em;
  color: rgba(99, 179, 237, 0.5);
}

/* 🔍 Detect toggle button — active state */
.ai-vision-detect-toggle.active {
  background: rgba(99, 179, 237, 0.28) !important;
  border-color: rgba(99, 179, 237, 0.55) !important;
  color: #00d4ff !important;
  font-weight: 700;
}

/* Light theme adjustments */
[data-theme="light"] .ai-vision-detections {
  background: rgba(235, 248, 255, 0.8);
  border-color: rgba(43, 108, 176, 0.25);
}

[data-theme="light"] .ai-detr-header,
[data-theme="light"] .ai-detr-status,
[data-theme="light"] .ai-detr-pill {
  color: #2c5282;
}
482+
483+
352484

353485
/* --- OCR Mode Pills --- */
354486
.ai-ocr-mode-pills {

0 commit comments

Comments
 (0)