diff --git a/packages/cli/src/capture/contentExtractor.test.ts b/packages/cli/src/capture/contentExtractor.test.ts index 968365384..236385a54 100644 --- a/packages/cli/src/capture/contentExtractor.test.ts +++ b/packages/cli/src/capture/contentExtractor.test.ts @@ -4,10 +4,9 @@ import { tmpdir } from "node:os"; import { join } from "node:path"; import { captionImagesWithGemini } from "./contentExtractor.js"; -// These tests exercise the OpenRouter provider path only — it makes a plain -// `fetch` call we can stub, with no native (`sharp`) or `@google/genai` -// dependency. OpenRouter wins over Gemini when OPENROUTER_API_KEY is set, so we -// don't need to clear the Gemini keys for the OpenRouter cases. +// These tests exercise the OpenRouter and custom-endpoint provider paths only — +// both make a plain `fetch` call we can stub, with no native (`sharp`) or +// `@google/genai` dependency. function makeProjectWithImage(): string { const dir = mkdtempSync(join(tmpdir(), "hf-caption-")); @@ -33,8 +32,6 @@ describe("captionImagesWithGemini — OpenRouter provider", () => { vi.stubEnv("OPENROUTER_API_KEY", "or-test-key"); vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "google/gemini-3.1-flash-lite"); - // Capture the request inside the mock, where the args are well-typed — - // avoids casting `mock.calls` (and the repo's ban on `as` assertions). let capturedUrl: string | undefined; let capturedInit: RequestInit | undefined; const fetchMock = vi.fn(async (url: string, init?: RequestInit) => { @@ -75,9 +72,6 @@ describe("captionImagesWithGemini — OpenRouter provider", () => { ); const warnings: string[] = []; - // captionOne throws on !res.ok, but the throw is per-image inside - // Promise.allSettled, so it's filtered out as a rejected result rather than - // bubbling up — same silent degradation as the existing Gemini path. 
const captions = await captionImagesWithGemini(dir, () => {}, warnings); expect(captions).toEqual({}); @@ -89,6 +83,7 @@ describe("captionImagesWithGemini — OpenRouter provider", () => { vi.stubEnv("OPENROUTER_API_KEY", ""); vi.stubEnv("GEMINI_API_KEY", ""); vi.stubEnv("GOOGLE_API_KEY", ""); + vi.stubEnv("HYPERFRAMES_VISION_API_KEY", ""); const fetchMock = vi.fn(); vi.stubGlobal("fetch", fetchMock); @@ -100,3 +95,87 @@ describe("captionImagesWithGemini — OpenRouter provider", () => { expect(fetchMock).not.toHaveBeenCalled(); }); }); + +describe("captionImagesWithGemini — custom OpenAI-compatible endpoint", () => { + const dirs: string[] = []; + afterEach(() => { + vi.unstubAllGlobals(); + vi.unstubAllEnvs(); + for (const d of dirs) rmSync(d, { recursive: true, force: true }); + dirs.length = 0; + }); + + it("captions via custom endpoint when HYPERFRAMES_VISION_API_KEY + BASE_URL + MODEL are set", async () => { + const dir = makeProjectWithImage(); + dirs.push(dir); + vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "ark-test-key"); + vi.stubEnv("HYPERFRAMES_VISION_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3"); + vi.stubEnv("HYPERFRAMES_VISION_MODEL", "doubao-seed-2-0-mini-260428"); + + let capturedUrl: string | undefined; + let capturedInit: RequestInit | undefined; + const fetchMock = vi.fn(async (url: string, init?: RequestInit) => { + capturedUrl = url; + capturedInit = init; + return new Response( + JSON.stringify({ choices: [{ message: { content: "A teal portfolio site." } }] }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }); + vi.stubGlobal("fetch", fetchMock); + + const warnings: string[] = []; + const captions = await captionImagesWithGemini(dir, () => {}, warnings); + + expect(captions).toEqual({ "hero.png": "A teal portfolio site." 
}); + expect(warnings).toEqual([]); + expect(fetchMock).toHaveBeenCalledTimes(1); + + expect(capturedUrl).toBe("https://ark.cn-beijing.volces.com/api/v3/chat/completions"); + expect(new Headers(capturedInit?.headers).get("authorization")).toBe("Bearer ark-test-key"); + const body = JSON.parse(typeof capturedInit?.body === "string" ? capturedInit.body : "{}"); + expect(body.model).toBe("doubao-seed-2-0-mini-260428"); + }); + + it("custom endpoint takes priority over OpenRouter when both are set", async () => { + const dir = makeProjectWithImage(); + dirs.push(dir); + vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "custom-key"); + vi.stubEnv("HYPERFRAMES_VISION_BASE_URL", "https://my-llm.example.com/v1"); + vi.stubEnv("HYPERFRAMES_VISION_MODEL", "my-vision-model"); + vi.stubEnv("OPENROUTER_API_KEY", "or-key-should-not-be-used"); + + let capturedUrl: string | undefined; + const fetchMock = vi.fn(async (url: string) => { + capturedUrl = url; + return new Response(JSON.stringify({ choices: [{ message: { content: "caption" } }] }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + }); + vi.stubGlobal("fetch", fetchMock); + + await captionImagesWithGemini(dir, () => {}, []); + + expect(capturedUrl).toBe("https://my-llm.example.com/v1/chat/completions"); + }); + + it("warns and skips captioning when MODEL is missing", async () => { + const dir = makeProjectWithImage(); + dirs.push(dir); + vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "ark-test-key"); + vi.stubEnv("HYPERFRAMES_VISION_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3"); + vi.stubEnv("HYPERFRAMES_VISION_MODEL", ""); + + const fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + + const warnings: string[] = []; + const captions = await captionImagesWithGemini(dir, () => {}, warnings); + + expect(captions).toEqual({}); + expect(fetchMock).not.toHaveBeenCalled(); + expect(warnings).toHaveLength(1); + expect(warnings[0]).toMatch(/HYPERFRAMES_VISION_MODEL/); + }); +}); diff --git 
a/packages/cli/src/capture/contentExtractor.ts b/packages/cli/src/capture/contentExtractor.ts index 2caee86e4..8f7e21a2c 100644 --- a/packages/cli/src/capture/contentExtractor.ts +++ b/packages/cli/src/capture/contentExtractor.ts @@ -158,10 +158,16 @@ export async function extractVisibleText(page: Page): Promise { /** * Caption downloaded images using a vision model. * - * Provider is chosen by which API key is present: OPENROUTER_API_KEY → OpenRouter - * (any vision model via its OpenAI-style API), else GEMINI_API_KEY/GOOGLE_API_KEY - * → Google Gemini, else no captioning. OpenRouter wins if both are set. + * Provider priority (first match wins): + * 1. HYPERFRAMES_VISION_API_KEY + HYPERFRAMES_VISION_BASE_URL — any OpenAI-compatible + * endpoint (Volcengine ARK, Azure OpenAI, local Ollama, self-hosted vLLM, etc.). + * Model is read from HYPERFRAMES_VISION_MODEL (required); if unset, captioning is skipped with a warning. + * 2. OPENROUTER_API_KEY — OpenRouter proxy (any vision model via its OpenAI-style API). + * Model defaults to HYPERFRAMES_OPENROUTER_MODEL, fallback "google/gemini-3.1-flash-lite". + * 3. GEMINI_API_KEY / GOOGLE_API_KEY — Google Gemini native SDK. + * Model defaults to HYPERFRAMES_GEMINI_MODEL, fallback "gemini-3.1-flash-lite-preview". * + * When none of the above keys are present, captioning is skipped silently. * Batches requests to stay under free-tier rate limits. * Returns a map of filename -> caption string. 
*/ @@ -171,21 +177,33 @@ export async function captionImagesWithGemini( warnings: string[], ): Promise> { const geminiCaptions: Record = {}; + + const customKey = process.env.HYPERFRAMES_VISION_API_KEY; + const customBaseUrl = process.env.HYPERFRAMES_VISION_BASE_URL; const openRouterKey = process.env.OPENROUTER_API_KEY; const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY; - if (!openRouterKey && !geminiKey) return geminiCaptions; - // OpenRouter takes priority when both keys are set — it's the explicit opt-in - // for users without Google access. Both providers satisfy the same - // single-image → one-line-caption contract (`captionOne`), so the batching and - // SVG-rasterization loops below stay provider-agnostic. - const useOpenRouter = Boolean(openRouterKey); - const providerName = useOpenRouter ? "OpenRouter" : "Gemini"; - // Default mirrors the Gemini path's tier (3.x flash-lite). Override per - // provider via HYPERFRAMES_OPENROUTER_MODEL / HYPERFRAMES_GEMINI_MODEL. - const model = useOpenRouter - ? process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite" - : process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview"; + if (!customKey && !openRouterKey && !geminiKey) return geminiCaptions; + + // Determine active provider and model. 
+ let providerName: string; + let model: string; + if (customKey && customBaseUrl) { + providerName = "custom vision API"; + model = process.env.HYPERFRAMES_VISION_MODEL || ""; + if (!model) { + warnings.push( + "HYPERFRAMES_VISION_API_KEY and HYPERFRAMES_VISION_BASE_URL are set but HYPERFRAMES_VISION_MODEL is missing — skipping image captioning.", + ); + return geminiCaptions; + } + } else if (openRouterKey) { + providerName = "OpenRouter"; + model = process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite"; + } else { + providerName = "Gemini"; + model = process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview"; + } progress("design", `Captioning images with ${providerName} vision...`); try { @@ -198,13 +216,15 @@ export async function captionImagesWithGemini( maxTokens: number; }) => Promise; - let captionOne: CaptionOne; - if (openRouterKey) { - captionOne = async ({ mimeType, base64, prompt, maxTokens }) => { - const res = await fetch("https://openrouter.ai/api/v1/chat/completions", { + // Shared fetch helper for any OpenAI-compatible chat/completions endpoint. 
+ const openAiCompatCaptionOne = + (apiKey: string, baseUrl: string): CaptionOne => + async ({ mimeType, base64, prompt, maxTokens }) => { + const url = baseUrl.replace(/\/$/, "") + "/chat/completions"; + const res = await fetch(url, { method: "POST", headers: { - Authorization: `Bearer ${openRouterKey}`, + Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json", }, body: JSON.stringify({ @@ -223,13 +243,21 @@ export async function captionImagesWithGemini( }); if (!res.ok) { const detail = await res.text().catch(() => ""); - throw new Error(`OpenRouter ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`); + throw new Error( + `${providerName} ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`, + ); } const data = (await res.json()) as { choices?: Array<{ message?: { content?: string } }>; }; return data.choices?.[0]?.message?.content?.trim() || ""; }; + + let captionOne: CaptionOne; + if (customKey && customBaseUrl) { + captionOne = openAiCompatCaptionOne(customKey, customBaseUrl); + } else if (openRouterKey) { + captionOne = openAiCompatCaptionOne(openRouterKey, "https://openrouter.ai/api/v1"); } else { // Unreachable when geminiKey is unset (guarded above); re-narrow for TS. if (!geminiKey) return geminiCaptions;