Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 88 additions & 9 deletions packages/cli/src/capture/contentExtractor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@ import { tmpdir } from "node:os";
import { join } from "node:path";
import { captionImagesWithGemini } from "./contentExtractor.js";

// These tests exercise the OpenRouter provider path only — it makes a plain
// `fetch` call we can stub, with no native (`sharp`) or `@google/genai`
// dependency. OpenRouter wins over Gemini when OPENROUTER_API_KEY is set, so we
// don't need to clear the Gemini keys for the OpenRouter cases.
// These tests exercise the OpenRouter and custom-endpoint provider paths only —
// both make a plain `fetch` call we can stub, with no native (`sharp`) or
// `@google/genai` dependency.

function makeProjectWithImage(): string {
const dir = mkdtempSync(join(tmpdir(), "hf-caption-"));
Expand All @@ -33,8 +32,6 @@ describe("captionImagesWithGemini — OpenRouter provider", () => {
vi.stubEnv("OPENROUTER_API_KEY", "or-test-key");
vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "google/gemini-3.1-flash-lite");

// Capture the request inside the mock, where the args are well-typed —
// avoids casting `mock.calls` (and the repo's ban on `as` assertions).
let capturedUrl: string | undefined;
let capturedInit: RequestInit | undefined;
const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
Expand Down Expand Up @@ -75,9 +72,6 @@ describe("captionImagesWithGemini — OpenRouter provider", () => {
);

const warnings: string[] = [];
// captionOne throws on !res.ok, but the throw is per-image inside
// Promise.allSettled, so it's filtered out as a rejected result rather than
// bubbling up — same silent degradation as the existing Gemini path.
const captions = await captionImagesWithGemini(dir, () => {}, warnings);

expect(captions).toEqual({});
Expand All @@ -89,6 +83,7 @@ describe("captionImagesWithGemini — OpenRouter provider", () => {
vi.stubEnv("OPENROUTER_API_KEY", "");
vi.stubEnv("GEMINI_API_KEY", "");
vi.stubEnv("GOOGLE_API_KEY", "");
vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "");

const fetchMock = vi.fn();
vi.stubGlobal("fetch", fetchMock);
Expand All @@ -100,3 +95,87 @@ describe("captionImagesWithGemini — OpenRouter provider", () => {
expect(fetchMock).not.toHaveBeenCalled();
});
});

// Covers the custom OpenAI-compatible provider path, selected when both
// HYPERFRAMES_VISION_API_KEY and HYPERFRAMES_VISION_BASE_URL are set. Every case
// stubs the global `fetch`, so no real network request is made.
describe("captionImagesWithGemini — custom OpenAI-compatible endpoint", () => {
  // Temp project directories created by the tests; removed after each case.
  const dirs: string[] = [];
  afterEach(() => {
    vi.unstubAllGlobals();
    vi.unstubAllEnvs();
    for (const d of dirs) rmSync(d, { recursive: true, force: true });
    dirs.length = 0;
  });

  it("captions via custom endpoint when HYPERFRAMES_VISION_API_KEY + BASE_URL + MODEL are set", async () => {
    const dir = makeProjectWithImage();
    dirs.push(dir);
    vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "ark-test-key");
    vi.stubEnv("HYPERFRAMES_VISION_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3");
    vi.stubEnv("HYPERFRAMES_VISION_MODEL", "doubao-seed-2-0-mini-260428");

    // Capture the request inside the mock, where the arguments are well-typed.
    let capturedUrl: string | undefined;
    let capturedInit: RequestInit | undefined;
    const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
      capturedUrl = url;
      capturedInit = init;
      return new Response(
        JSON.stringify({ choices: [{ message: { content: "A teal portfolio site." } }] }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    });
    vi.stubGlobal("fetch", fetchMock);

    const warnings: string[] = [];
    const captions = await captionImagesWithGemini(dir, () => {}, warnings);

    expect(captions).toEqual({ "hero.png": "A teal portfolio site." });
    expect(warnings).toEqual([]);
    expect(fetchMock).toHaveBeenCalledTimes(1);

    // The base URL gets "/chat/completions" appended, the key is sent as a
    // Bearer token, and the configured model is forwarded in the JSON body.
    expect(capturedUrl).toBe("https://ark.cn-beijing.volces.com/api/v3/chat/completions");
    expect(new Headers(capturedInit?.headers).get("authorization")).toBe("Bearer ark-test-key");
    const body = JSON.parse(typeof capturedInit?.body === "string" ? capturedInit.body : "{}");
    expect(body.model).toBe("doubao-seed-2-0-mini-260428");
  });

  it("custom endpoint takes priority over OpenRouter when both are set", async () => {
    const dir = makeProjectWithImage();
    dirs.push(dir);
    vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "custom-key");
    vi.stubEnv("HYPERFRAMES_VISION_BASE_URL", "https://my-llm.example.com/v1");
    vi.stubEnv("HYPERFRAMES_VISION_MODEL", "my-vision-model");
    vi.stubEnv("OPENROUTER_API_KEY", "or-key-should-not-be-used");

    let capturedUrl: string | undefined;
    let capturedInit: RequestInit | undefined;
    const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
      capturedUrl = url;
      capturedInit = init;
      return new Response(JSON.stringify({ choices: [{ message: { content: "caption" } }] }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    });
    vi.stubGlobal("fetch", fetchMock);

    await captionImagesWithGemini(dir, () => {}, []);

    // Exactly one request, and it goes to the custom endpoint, not OpenRouter.
    expect(fetchMock).toHaveBeenCalledTimes(1);
    expect(capturedUrl).toBe("https://my-llm.example.com/v1/chat/completions");
    // Checking the URL alone would miss a credential mix-up: make sure the
    // custom key (not the OpenRouter key) is the one sent.
    expect(new Headers(capturedInit?.headers).get("authorization")).toBe("Bearer custom-key");
  });

  it("warns and skips captioning when MODEL is missing", async () => {
    const dir = makeProjectWithImage();
    dirs.push(dir);
    vi.stubEnv("HYPERFRAMES_VISION_API_KEY", "ark-test-key");
    vi.stubEnv("HYPERFRAMES_VISION_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3");
    // An empty string counts as "unset": the implementation reads the model
    // with `|| ""` and bails out when the result is falsy.
    vi.stubEnv("HYPERFRAMES_VISION_MODEL", "");

    const fetchMock = vi.fn();
    vi.stubGlobal("fetch", fetchMock);

    const warnings: string[] = [];
    const captions = await captionImagesWithGemini(dir, () => {}, warnings);

    // No captions, no network call, and a single warning naming the missing variable.
    expect(captions).toEqual({});
    expect(fetchMock).not.toHaveBeenCalled();
    expect(warnings).toHaveLength(1);
    expect(warnings[0]).toMatch(/HYPERFRAMES_VISION_MODEL/);
  });
});
70 changes: 49 additions & 21 deletions packages/cli/src/capture/contentExtractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,16 @@ export async function extractVisibleText(page: Page): Promise<string> {
/**
* Caption downloaded images using a vision model.
*
* Provider is chosen by which API key is present: OPENROUTER_API_KEY → OpenRouter
* (any vision model via its OpenAI-style API), else GEMINI_API_KEY/GOOGLE_API_KEY
* → Google Gemini, else no captioning. OpenRouter wins if both are set.
* Provider priority (first match wins):
* 1. HYPERFRAMES_VISION_API_KEY + HYPERFRAMES_VISION_BASE_URL — any OpenAI-compatible
* endpoint (Volcengine ARK, Azure OpenAI, local Ollama, self-hosted vLLM, etc.).
* The model is read from HYPERFRAMES_VISION_MODEL and is required: when it is unset,
* a warning is recorded and captioning is skipped.
* 2. OPENROUTER_API_KEY — OpenRouter proxy (any vision model via its OpenAI-style API).
* Model defaults to HYPERFRAMES_OPENROUTER_MODEL, fallback "google/gemini-3.1-flash-lite".
* 3. GEMINI_API_KEY / GOOGLE_API_KEY — Google Gemini native SDK.
* Model defaults to HYPERFRAMES_GEMINI_MODEL, fallback "gemini-3.1-flash-lite-preview".
*
* When none of the above keys are present, captioning is skipped silently.
* Batches requests to stay under free-tier rate limits.
* Returns a map of filename -> caption string.
*/
Expand All @@ -171,21 +177,33 @@ export async function captionImagesWithGemini(
warnings: string[],
): Promise<Record<string, string>> {
const geminiCaptions: Record<string, string> = {};

const customKey = process.env.HYPERFRAMES_VISION_API_KEY;
const customBaseUrl = process.env.HYPERFRAMES_VISION_BASE_URL;
const openRouterKey = process.env.OPENROUTER_API_KEY;
const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
if (!openRouterKey && !geminiKey) return geminiCaptions;

// OpenRouter takes priority when both keys are set — it's the explicit opt-in
// for users without Google access. Both providers satisfy the same
// single-image → one-line-caption contract (`captionOne`), so the batching and
// SVG-rasterization loops below stay provider-agnostic.
const useOpenRouter = Boolean(openRouterKey);
const providerName = useOpenRouter ? "OpenRouter" : "Gemini";
// Default mirrors the Gemini path's tier (3.x flash-lite). Override per
// provider via HYPERFRAMES_OPENROUTER_MODEL / HYPERFRAMES_GEMINI_MODEL.
const model = useOpenRouter
? process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite"
: process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview";
if (!customKey && !openRouterKey && !geminiKey) return geminiCaptions;

// Determine active provider and model.
let providerName: string;
let model: string;
if (customKey && customBaseUrl) {
providerName = "custom vision API";
model = process.env.HYPERFRAMES_VISION_MODEL || "";
if (!model) {
warnings.push(
"HYPERFRAMES_VISION_API_KEY and HYPERFRAMES_VISION_BASE_URL are set but HYPERFRAMES_VISION_MODEL is missing — skipping image captioning.",
);
return geminiCaptions;
}
} else if (openRouterKey) {
providerName = "OpenRouter";
model = process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite";
} else {
providerName = "Gemini";
model = process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview";
}

progress("design", `Captioning images with ${providerName} vision...`);
try {
Expand All @@ -198,13 +216,15 @@ export async function captionImagesWithGemini(
maxTokens: number;
}) => Promise<string>;

let captionOne: CaptionOne;
if (openRouterKey) {
captionOne = async ({ mimeType, base64, prompt, maxTokens }) => {
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
// Shared fetch helper for any OpenAI-compatible chat/completions endpoint.
const openAiCompatCaptionOne =
(apiKey: string, baseUrl: string): CaptionOne =>
async ({ mimeType, base64, prompt, maxTokens }) => {
const url = baseUrl.replace(/\/$/, "") + "/chat/completions";
const res = await fetch(url, {
method: "POST",
headers: {
Authorization: `Bearer ${openRouterKey}`,
Authorization: `Bearer ${apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
Expand All @@ -223,13 +243,21 @@ export async function captionImagesWithGemini(
});
if (!res.ok) {
const detail = await res.text().catch(() => "");
throw new Error(`OpenRouter ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`);
throw new Error(
`${providerName} ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`,
);
}
const data = (await res.json()) as {
choices?: Array<{ message?: { content?: string } }>;
};
return data.choices?.[0]?.message?.content?.trim() || "";
};

let captionOne: CaptionOne;
if (customKey && customBaseUrl) {
captionOne = openAiCompatCaptionOne(customKey, customBaseUrl);
} else if (openRouterKey) {
captionOne = openAiCompatCaptionOne(openRouterKey, "https://openrouter.ai/api/v1");
} else {
// Unreachable when geminiKey is unset (guarded above); re-narrow for TS.
if (!geminiKey) return geminiCaptions;
Expand Down