From 1d3b130afb712ae14569f0ff29a25fc9dcd358b7 Mon Sep 17 00:00:00 2001 From: default Date: Tue, 20 Jan 2026 20:25:25 +0900 Subject: [PATCH 1/2] Fix array format output for links and images The links and images formats return arrays from Firecrawl API, but writeOutput expected a string, causing endsWith() to fail. Now converts arrays to newline-separated strings for CLI-friendly output. Co-Authored-By: Claude Opus 4.5 --- src/utils/output.ts | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/utils/output.ts b/src/utils/output.ts index d913f8d17..1373d97dd 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -25,14 +25,22 @@ function extractContent(data: any, format?: ScrapeFormat): string | null { return data.markdown || data[format] || null; } - // Handle links format + // Handle links format (array of URLs -> newline-separated string) if (format === 'links') { - return data.links || data[format] || null; + const links = data.links || data[format]; + if (Array.isArray(links)) { + return links.join('\n'); + } + return links || null; } - // Handle images format + // Handle images format (array of URLs -> newline-separated string) if (format === 'images') { - return data.images || data[format] || null; + const images = data.images || data[format]; + if (Array.isArray(images)) { + return images.join('\n'); + } + return images || null; } // Handle summary format From 35941324261f386fd4821ad1b738084a17df2533 Mon Sep 17 00:00:00 2001 From: default Date: Tue, 20 Jan 2026 20:52:54 +0900 Subject: [PATCH 2/2] Add multiple format support for scrape command - Allow comma-separated formats: --format markdown,links,images - Single format outputs raw content (backward compatible) - Multiple formats output JSON with all requested data - Case-insensitive format input (rawHtml, RAWHTML both work) - Add format validation with helpful error messages - Update help text to clarify multiple format usage Co-Authored-By: Claude Opus 4.5 --- 
src/__tests__/commands/scrape.test.ts | 30 ++++- src/commands/scrape.ts | 33 ++++-- src/index.ts | 4 +- src/types/scrape.ts | 4 +- src/utils/options.ts | 68 ++++++++++- src/utils/output.ts | 157 ++++++++++++++++---------- 6 files changed, 213 insertions(+), 83 deletions(-) diff --git a/src/__tests__/commands/scrape.test.ts b/src/__tests__/commands/scrape.test.ts index e054f1d9b..15b284f08 100644 --- a/src/__tests__/commands/scrape.test.ts +++ b/src/__tests__/commands/scrape.test.ts @@ -63,7 +63,7 @@ describe('executeScrape', () => { await executeScrape({ url: 'https://example.com', - format: 'html', + formats: ['html'], }); expect(mockClient.scrape).toHaveBeenCalledWith('https://example.com', { @@ -97,7 +97,7 @@ describe('executeScrape', () => { await executeScrape({ url: 'https://example.com', - format: 'markdown', + formats: ['markdown'], screenshot: true, }); @@ -172,7 +172,7 @@ describe('executeScrape', () => { await executeScrape({ url: 'https://example.com', - format: 'markdown', + formats: ['markdown'], screenshot: true, onlyMainContent: true, waitFor: 3000, @@ -256,21 +256,39 @@ describe('executeScrape', () => { describe('Type safety', () => { it('should accept valid ScrapeFormat types', async () => { - const formats: Array<'markdown' | 'html' | 'rawHtml' | 'links'> = [ + const formatList: Array<'markdown' | 'html' | 'rawHtml' | 'links'> = [ 'markdown', 'html', 'rawHtml', 'links', ]; - for (const format of formats) { + for (const format of formatList) { mockClient.scrape.mockResolvedValue({ [format]: 'test' }); const result = await executeScrape({ url: 'https://example.com', - format, + formats: [format], }); expect(result.success).toBe(true); } }); + + it('should accept multiple formats', async () => { + mockClient.scrape.mockResolvedValue({ + markdown: '# Test', + links: ['http://a.com'], + images: ['http://img.com/a.png'], + }); + + const result = await executeScrape({ + url: 'https://example.com', + formats: ['markdown', 'links', 'images'], + }); + + 
expect(result.success).toBe(true); + expect(mockClient.scrape).toHaveBeenCalledWith('https://example.com', { + formats: ['markdown', 'links', 'images'], + }); + }); }); }); diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts index dcf95e2c8..cdd5d679a 100644 --- a/src/commands/scrape.ts +++ b/src/commands/scrape.ts @@ -3,7 +3,11 @@ */ import type { FormatOption } from '@mendable/firecrawl-js'; -import type { ScrapeOptions, ScrapeResult } from '../types/scrape'; +import type { + ScrapeOptions, + ScrapeResult, + ScrapeFormat, +} from '../types/scrape'; import { getClient } from '../utils/client'; import { handleScrapeOutput } from '../utils/output'; @@ -51,15 +55,14 @@ export async function executeScrape( // Build scrape options const formats: FormatOption[] = []; - if (options.format) { - formats.push(options.format); + // Add requested formats + if (options.formats && options.formats.length > 0) { + formats.push(...options.formats); } - if (options.screenshot) { - // Add screenshot format if not already included - if (!formats.includes('screenshot')) { - formats.push('screenshot'); - } + // Add screenshot format if requested and not already included + if (options.screenshot && !formats.includes('screenshot')) { + formats.push('screenshot'); } // If no formats specified, default to markdown @@ -123,5 +126,17 @@ export async function handleScrapeCommand( options: ScrapeOptions ): Promise<void> { const result = await executeScrape(options); - handleScrapeOutput(result, options.format, options.output, options.pretty); + + // Determine effective formats for output handling + const effectiveFormats: ScrapeFormat[] = + options.formats && options.formats.length > 0 + ? 
[...options.formats] + : ['markdown']; + + // Add screenshot to effective formats if it was requested separately + if (options.screenshot && !effectiveFormats.includes('screenshot')) { + effectiveFormats.push('screenshot'); + } + + handleScrapeOutput(result, effectiveFormats, options.output, options.pretty); } diff --git a/src/index.ts b/src/index.ts index fc00ffab8..e41b72b88 100644 --- a/src/index.ts +++ b/src/index.ts @@ -54,8 +54,8 @@ function createScrapeCommand(): Command { ) .option('-H, --html', 'Output raw HTML (shortcut for --format html)') .option( - '-f, --format <format>', - 'Output format: markdown, html, rawHtml, links, images, screenshot, summary, changeTracking, json, attributes, branding', + '-f, --format <formats>', + 'Output format(s). Multiple formats can be specified with commas (e.g., "markdown,links,images"). Available: markdown, html, rawHtml, links, images, screenshot, summary, changeTracking, json, attributes, branding. Single format outputs raw content; multiple formats output JSON.', 'markdown' ) .option('--only-main-content', 'Include only main content', false) diff --git a/src/types/scrape.ts b/src/types/scrape.ts index dca3a18ec..2adbaf36f 100644 --- a/src/types/scrape.ts +++ b/src/types/scrape.ts @@ -18,8 +18,8 @@ export type ScrapeFormat = export interface ScrapeOptions { /** URL to scrape */ url: string; - /** Output format (markdown, html, etc.) 
*/ - format?: ScrapeFormat; + /** Output format(s) - single format or array of formats */ + formats?: ScrapeFormat[]; /** Include only main content */ onlyMainContent?: boolean; /** Wait time before scraping (ms) */ diff --git a/src/utils/options.ts b/src/utils/options.ts index dfd1f94df..67b0166c6 100644 --- a/src/utils/options.ts +++ b/src/utils/options.ts @@ -2,15 +2,79 @@ * Option parsing utilities */ -import type { ScrapeOptions } from '../types/scrape'; +import type { ScrapeOptions, ScrapeFormat } from '../types/scrape'; + +/** + * Valid scrape format values + */ +const VALID_FORMATS: ScrapeFormat[] = [ + 'markdown', + 'html', + 'rawHtml', + 'links', + 'images', + 'screenshot', + 'summary', + 'changeTracking', + 'json', + 'attributes', + 'branding', +]; + +/** + * Map from lowercase to correct camelCase format + */ +const FORMAT_MAP: Record<string, ScrapeFormat> = Object.fromEntries( + VALID_FORMATS.map((f) => [f.toLowerCase(), f]) +) as Record<string, ScrapeFormat>; + +/** + * Parse format string into array of ScrapeFormat + * Handles comma-separated values: "markdown,links,images" + * Case-insensitive input, returns correct camelCase for API + */ +export function parseFormats(formatString: string): ScrapeFormat[] { + const inputFormats = formatString + .split(',') + .map((f) => f.trim().toLowerCase()) + .filter((f) => f.length > 0); + + // Validate and map to correct casing + const invalidFormats: string[] = []; + const validFormats: ScrapeFormat[] = []; + + for (const input of inputFormats) { + const mapped = FORMAT_MAP[input]; + if (mapped) { + validFormats.push(mapped); + } else { + invalidFormats.push(input); + } + } + + if (invalidFormats.length > 0) { + throw new Error( + `Invalid format(s): ${invalidFormats.join(', ')}. 
Valid formats are: ${VALID_FORMATS.join(', ')}` + ); + } + + // Remove duplicates while preserving order + return [...new Set(validFormats)]; +} /** * Convert commander options to ScrapeOptions */ export function parseScrapeOptions(options: any): ScrapeOptions { + // Parse formats from comma-separated string + let formats: ScrapeFormat[] | undefined; + if (options.format) { + formats = parseFormats(options.format); + } + return { url: options.url, - format: options.format, + formats, onlyMainContent: options.onlyMainContent, waitFor: options.waitFor, screenshot: options.screenshot, diff --git a/src/utils/output.ts b/src/utils/output.ts index 1373d97dd..cb822fe3f 100644 --- a/src/utils/output.ts +++ b/src/utils/output.ts @@ -4,62 +4,91 @@ import * as fs from 'fs'; import * as path from 'path'; -import type { ScrapeResult } from '../types/scrape'; -import type { ScrapeFormat } from '../types/scrape'; +import type { ScrapeResult, ScrapeFormat } from '../types/scrape'; + +/** + * Text formats that can be output as raw content (curl-like) + */ +const RAW_TEXT_FORMATS: ScrapeFormat[] = [ + 'html', + 'rawHtml', + 'markdown', + 'links', + 'images', + 'summary', +]; /** * Extract content from Firecrawl Document based on format */ -function extractContent(data: any, format?: ScrapeFormat): string | null { +function extractContent(data: any, format: ScrapeFormat): string | null { if (!data) return null; - // If format is specified, try to extract that specific content - if (format) { - // Handle html/rawHtml formats - extract HTML content directly - if (format === 'html' || format === 'rawHtml') { - return data.html || data.rawHtml || data[format] || null; - } + // Handle html/rawHtml formats - extract HTML content directly + if (format === 'html' || format === 'rawHtml') { + return data.html || data.rawHtml || data[format] || null; + } - // Handle markdown format - if (format === 'markdown') { - return data.markdown || data[format] || null; - } + // Handle markdown format + 
if (format === 'markdown') { + return data.markdown || data[format] || null; + } - // Handle links format (array of URLs -> newline-separated string) - if (format === 'links') { - const links = data.links || data[format]; - if (Array.isArray(links)) { - return links.join('\n'); - } - return links || null; + // Handle links format (array of URLs -> newline-separated string) + if (format === 'links') { + const links = data.links || data[format]; + if (Array.isArray(links)) { + return links.join('\n'); } + return links || null; + } - // Handle images format (array of URLs -> newline-separated string) - if (format === 'images') { - const images = data.images || data[format]; - if (Array.isArray(images)) { - return images.join('\n'); - } - return images || null; + // Handle images format (array of URLs -> newline-separated string) + if (format === 'images') { + const images = data.images || data[format]; + if (Array.isArray(images)) { + return images.join('\n'); } + return images || null; + } - // Handle summary format - if (format === 'summary') { - return data.summary || data[format] || null; - } + // Handle summary format + if (format === 'summary') { + return data.summary || data[format] || null; } - // Fallback: try common content fields - if (typeof data === 'string') { - return data; + return null; +} + +/** + * Extract multiple format contents from response data + */ +function extractMultipleFormats( + data: any, + formats: ScrapeFormat[] +): Record<string, any> { + const result: Record<string, any> = {}; + + for (const format of formats) { + const key = format; + + if (data[key] !== undefined) { + result[key] = data[key]; + } else if (format === 'html' && data.rawHtml !== undefined) { + // Fallback for html -> rawHtml + result[key] = data.rawHtml; + } else if (format === 'rawHtml' && data.html !== undefined) { + // Fallback for rawHtml -> html + result[key] = data.html; + } } - // If it's an object, try to find string content - if (typeof data === 'object') { - return data.html || 
data.markdown || data.rawHtml || data.content || null; + // Always include metadata if present + if (data.metadata) { + result.metadata = data.metadata; } - return null; + return result; } /** @@ -92,12 +121,15 @@ export function writeOutput( /** * Handle scrape result output - * For text formats (html, markdown, etc.), outputs raw content directly - * For complex formats, outputs JSON + * + * Output behavior: + * - Single text format (html, markdown, links, images, summary, rawHtml): raw content + * - Single complex format (screenshot, json, branding, etc.): JSON output + * - Multiple formats: JSON with all requested data */ export function handleScrapeOutput( result: ScrapeResult, - format?: ScrapeFormat, + formats: ScrapeFormat[], outputPath?: string, pretty: boolean = false ): void { @@ -111,42 +143,43 @@ export function handleScrapeOutput( return; } - // Text formats that should output raw content (curl-like) - const rawTextFormats: ScrapeFormat[] = [ - 'html', - 'rawHtml', - 'markdown', - 'links', - 'images', - 'summary', - ]; - const shouldOutputRaw = format && rawTextFormats.includes(format); - - if (shouldOutputRaw) { - // Extract and output raw content - const content = extractContent(result.data, format); + // Determine output mode based on number of formats + const isSingleFormat = formats.length === 1; + const singleFormat = isSingleFormat ? 
formats[0] : null; + const isRawTextFormat = + singleFormat && RAW_TEXT_FORMATS.includes(singleFormat); + + // Single raw text format: output raw content (current behavior) + if (isSingleFormat && isRawTextFormat && singleFormat) { + const content = extractContent(result.data, singleFormat); if (content !== null) { writeOutput(content, outputPath, !!outputPath); return; } } - // For JSON format or complex formats (branding, json, etc.), output clean JSON - // Always stringify the entire data object to ensure valid JSON + // Multiple formats or complex format: output JSON + let outputData: any; + + if (isSingleFormat) { + // Single complex format - output entire data object + outputData = result.data; + } else { + // Multiple formats - extract only requested formats + outputData = extractMultipleFormats(result.data, formats); + } + let jsonContent: string; try { jsonContent = pretty - ? JSON.stringify(result.data, null, 2) - : JSON.stringify(result.data); + ? JSON.stringify(outputData, null, 2) + : JSON.stringify(outputData); } catch (error) { - // If stringification fails, try to create a minimal error response jsonContent = JSON.stringify({ error: 'Failed to serialize response', message: error instanceof Error ? error.message : 'Unknown error', }); } - // Ensure clean JSON output (no extra newlines or text before JSON) - // Output directly to stdout without any prefix writeOutput(jsonContent, outputPath, !!outputPath); }