From 16e09f63394359b83eff2c64cb4394214210b6be Mon Sep 17 00:00:00 2001 From: mogery Date: Tue, 21 Apr 2026 18:15:39 +0100 Subject: [PATCH 01/10] feat: parse command --- src/commands/parse.ts | 239 ++++++++++++++++++++++++++++++++++++++++++ src/index.ts | 94 +++++++++++++++++ src/types/parse.ts | 42 ++++++++ 3 files changed, 375 insertions(+) create mode 100644 src/commands/parse.ts create mode 100644 src/types/parse.ts diff --git a/src/commands/parse.ts b/src/commands/parse.ts new file mode 100644 index 000000000..1fd481fc9 --- /dev/null +++ b/src/commands/parse.ts @@ -0,0 +1,239 @@ +/** + * Parse command implementation + * + * Uploads a local file to the Firecrawl /v2/parse endpoint and returns the + * parsed document in the requested format(s). Supported file types: + * .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import type { FormatOption } from '@mendable/firecrawl-js'; +import type { ParseOptions, ParseResult } from '../types/parse'; +import type { ScrapeFormat } from '../types/scrape'; +import { getClient } from '../utils/client'; +import { getConfig, validateConfig } from '../utils/config'; +import { handleScrapeOutput } from '../utils/output'; + +const DEFAULT_API_URL = 'https://api.firecrawl.dev'; + +/** File extensions accepted by /v2/parse (mirrors the API controller). */ +const SUPPORTED_EXTENSIONS = new Set([ + '.html', + '.htm', + '.pdf', + '.docx', + '.doc', + '.odt', + '.rtf', + '.xlsx', + '.xls', +]); + +/** + * Best-effort content-type lookup so the API's kind detector has a hint + * even if the extension is ambiguous. 
+ */ +const CONTENT_TYPE_BY_EXT: Record = { + '.html': 'text/html', + '.htm': 'text/html', + '.pdf': 'application/pdf', + '.docx': + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.doc': 'application/msword', + '.odt': 'application/vnd.oasis.opendocument.text', + '.rtf': 'application/rtf', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.xls': 'application/vnd.ms-excel', +}; + +function outputTiming( + options: ParseOptions, + requestStartTime: number, + requestEndTime: number, + error?: Error | unknown +): void { + if (!options.timing) return; + + const duration = requestEndTime - requestStartTime; + const info: Record = { + file: options.file, + requestTime: new Date(requestStartTime).toISOString(), + duration: `${duration}ms`, + status: error ? 'error' : 'success', + }; + if (error) { + info.error = error instanceof Error ? error.message : 'Unknown error'; + } + console.error('Timing:', JSON.stringify(info, null, 2)); +} + +/** + * Build the `formats` array sent to the API (mirrors scrape's behavior). + */ +function buildFormats(options: ParseOptions): FormatOption[] { + const formats: FormatOption[] = []; + + if (options.formats && options.formats.length > 0) { + formats.push(...options.formats); + } + + if (options.query) { + formats.push({ type: 'query', prompt: options.query } as any); + } + + if (formats.length === 0) { + formats.push('markdown'); + } + + return formats; +} + +/** + * Build the JSON `options` payload uploaded alongside the file. 
+ */ +function buildOptionsPayload(options: ParseOptions): Record { + const payload: Record = { + formats: buildFormats(options), + integration: 'cli', + }; + + if (options.onlyMainContent !== undefined) { + payload.onlyMainContent = options.onlyMainContent; + } + if (options.includeTags && options.includeTags.length > 0) { + payload.includeTags = options.includeTags; + } + if (options.excludeTags && options.excludeTags.length > 0) { + payload.excludeTags = options.excludeTags; + } + if (options.timeout !== undefined) { + payload.timeout = options.timeout; + } + if (options.location) { + payload.location = options.location; + } + + return payload; +} + +/** + * Execute the parse command by POSTing a multipart upload to /v2/parse. + */ +export async function executeParse( + options: ParseOptions +): Promise { + const filePath = path.resolve(options.file); + + if (!fs.existsSync(filePath)) { + return { + success: false, + error: `File not found: ${options.file}`, + }; + } + + const stat = fs.statSync(filePath); + if (!stat.isFile()) { + return { + success: false, + error: `Not a file: ${options.file}`, + }; + } + + const ext = path.extname(filePath).toLowerCase(); + if (!SUPPORTED_EXTENSIONS.has(ext)) { + return { + success: false, + error: + `Unsupported file type "${ext || '(none)'}". ` + + `Supported extensions: ${[...SUPPORTED_EXTENSIONS].join(', ')}`, + }; + } + + // Ensure auth/url is resolved through the same config pipeline scrape uses. + if (options.apiKey || options.apiUrl) { + getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); + } + + const config = getConfig(); + const apiKey = options.apiKey || config.apiKey; + validateConfig(apiKey); + + const apiUrl = (options.apiUrl || config.apiUrl || DEFAULT_API_URL).replace( + /\/$/, + '' + ); + + const buffer = fs.readFileSync(filePath); + const filename = path.basename(filePath); + const contentType = CONTENT_TYPE_BY_EXT[ext] ?? 
'application/octet-stream'; + + const form = new FormData(); + form.append( + 'file', + new Blob([new Uint8Array(buffer)], { type: contentType }), + filename + ); + form.append('options', JSON.stringify(buildOptionsPayload(options))); + + const requestStartTime = Date.now(); + + try { + const response = await fetch(`${apiUrl}/v2/parse`, { + method: 'POST', + headers: apiKey ? { Authorization: `Bearer ${apiKey}` } : undefined, + body: form, + }); + + const requestEndTime = Date.now(); + outputTiming(options, requestStartTime, requestEndTime); + + const payload = (await response.json().catch(() => ({}))) as any; + + if (!response.ok || payload?.success === false) { + const message = + payload?.error || + `HTTP ${response.status}: ${response.statusText || 'Request failed'}`; + return { success: false, error: message }; + } + + return { + success: true, + data: payload?.data ?? payload, + }; + } catch (error) { + const requestEndTime = Date.now(); + outputTiming(options, requestStartTime, requestEndTime, error); + return { + success: false, + error: error instanceof Error ? error.message : 'Unknown error occurred', + }; + } +} + +/** + * Handle parse command output. Reuses the scrape output formatter since the + * /v2/parse response shape matches /v2/scrape. + */ +export async function handleParseCommand(options: ParseOptions): Promise { + const result = await executeParse(options); + + if (options.query && result.success && result.data?.answer) { + const { writeOutput } = await import('../utils/output'); + writeOutput(result.data.answer, options.output, !!options.output); + return; + } + + const effectiveFormats: ScrapeFormat[] = + options.formats && options.formats.length > 0 + ? 
[...options.formats] + : ['markdown']; + + handleScrapeOutput( + result, + effectiveFormats, + options.output, + options.pretty, + options.json + ); +} diff --git a/src/index.ts b/src/index.ts index 1faec0138..87076889b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,6 +16,7 @@ import { configure, viewConfig } from './commands/config'; import { handleCreditUsageCommand } from './commands/credit-usage'; import { handleCrawlCommand } from './commands/crawl'; import { handleMapCommand } from './commands/map'; +import { handleParseCommand } from './commands/parse'; import { handleSearchCommand } from './commands/search'; import { handleAgentCommand } from './commands/agent'; import { @@ -61,6 +62,7 @@ const AUTH_REQUIRED_COMMANDS = [ 'download', 'crawl', 'map', + 'parse', 'search', 'agent', 'browser', @@ -498,6 +500,97 @@ function createMapCommand(): Command { return mapCmd; } +/** + * Create and configure the parse command + */ +function createParseCommand(): Command { + const parseCmd = new Command('parse') + .description( + 'Parse a local file (HTML, PDF, DOCX, DOC, ODT, RTF, XLSX, XLS) into markdown, HTML, links, JSON, and more. Uses /v2/parse.' + ) + .argument('', 'Path to the local file to parse') + .option('-H, --html', 'Output raw HTML (shortcut for --format html)') + .option( + '-f, --format ', + 'Output format(s). Multiple formats can be specified with commas (e.g., "markdown,links"). Available: markdown, html, rawHtml, links, images, summary, json, attributes. Single format outputs raw content; multiple formats output JSON.' 
+ ) + .option('--only-main-content', 'Include only main content', false) + .option('-S, --summary', 'Output summary (shortcut for --format summary)') + .option('--include-tags ', 'Comma-separated list of tags to include') + .option('--exclude-tags ', 'Comma-separated list of tags to exclude') + .option( + '--timeout ', + 'Timeout in milliseconds for the parse job', + parseInt + ) + .option( + '-Q, --query ', + 'Ask a question about the parsed content (query format)' + ) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('--api-url ', 'API URL (overrides global --api-url)') + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as JSON format', false) + .option('--pretty', 'Pretty print JSON output', false) + .option( + '--timing', + 'Show request timing and other useful information', + false + ) + .addHelpText( + 'after', + ` +Examples: + $ firecrawl parse ./report.pdf + $ firecrawl parse ./report.pdf -f markdown,links + $ firecrawl parse ./page.html -H + $ firecrawl parse ./contract.docx --only-main-content + $ firecrawl parse ./report.pdf -Q "What is the total revenue?" + $ firecrawl parse ./report.pdf --json --pretty -o report.json + +Supported file types: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls +Max upload size: 50 MB +` + ) + .action(async (file: string, options) => { + let format: string | undefined; + if (options.html) { + format = 'html'; + } else if (options.summary) { + format = 'summary'; + } else if (options.format) { + format = options.format; + } + + const scrapeOptions = parseScrapeOptions({ + ...options, + url: 'file://' + file, + format: format ?? 
'markdown', + }); + + await handleParseCommand({ + file, + formats: scrapeOptions.formats, + onlyMainContent: scrapeOptions.onlyMainContent, + includeTags: scrapeOptions.includeTags, + excludeTags: scrapeOptions.excludeTags, + timeout: options.timeout, + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + pretty: options.pretty, + json: options.json, + timing: options.timing, + query: options.query, + }); + }); + + return parseCmd; +} + /** * Create and configure the search command */ @@ -1185,6 +1278,7 @@ Examples: // Add core commands to main program program.addCommand(createCrawlCommand()); program.addCommand(createMapCommand()); +program.addCommand(createParseCommand()); program.addCommand(createSearchCommand()); program.addCommand(createAgentCommand()); program.addCommand(createInteractCommand()); diff --git a/src/types/parse.ts b/src/types/parse.ts new file mode 100644 index 000000000..7499ef541 --- /dev/null +++ b/src/types/parse.ts @@ -0,0 +1,42 @@ +/** + * Types and interfaces for the parse command + */ + +import type { ScrapeFormat, ScrapeLocation } from './scrape'; + +export interface ParseOptions { + /** Local file path to parse */ + file: string; + /** Output format(s) */ + formats?: ScrapeFormat[]; + /** Include only main content */ + onlyMainContent?: boolean; + /** Include tags */ + includeTags?: string[]; + /** Exclude tags */ + excludeTags?: string[]; + /** Timeout in milliseconds for the parse job */ + timeout?: number; + /** API key for Firecrawl */ + apiKey?: string; + /** API URL for Firecrawl */ + apiUrl?: string; + /** Output file path */ + output?: string; + /** Pretty print JSON output */ + pretty?: boolean; + /** Force JSON output */ + json?: boolean; + /** Show request timing */ + timing?: boolean; + /** Location for geo-targeted parsing (typically unused for local files) */ + location?: ScrapeLocation; + /** Ask a question about the parsed content (query format) */ + query?: string; +} + +export interface 
ParseResult { + success: boolean; + data?: any; + error?: string; +} From 198b539648d7f0fd40ee7bb03400974468127aad Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 09:21:01 -0400 Subject: [PATCH 02/10] add firecrawl-parse skill covers local file parsing (pdf/docx/xlsx/etc) with summary and query shortcuts. registers in firecrawl-cli hub. --- skills/firecrawl-cli/SKILL.md | 2 + skills/firecrawl-parse/SKILL.md | 69 +++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 skills/firecrawl-parse/SKILL.md diff --git a/skills/firecrawl-cli/SKILL.md b/skills/firecrawl-cli/SKILL.md index 4664c6207..8d7e6a7c7 100644 --- a/skills/firecrawl-cli/SKILL.md +++ b/skills/firecrawl-cli/SKILL.md @@ -62,6 +62,7 @@ Follow this escalation pattern: | AI-powered data extraction | `agent` | Need structured data from complex sites | | Interact with a page | `scrape` + `interact` | Content requires clicks, form fills, pagination, or login | | Download a site to files | `download` | Save an entire site as local files | +| Parse a local file | `parse` | File on disk (PDF, DOCX, XLSX, etc.) — not a URL | For detailed command reference, run `firecrawl --help`. @@ -85,6 +86,7 @@ For detailed command reference, run `firecrawl --help`. 
- **AI-powered structured extraction from complex sites** -> [firecrawl-agent](../firecrawl-agent/SKILL.md) - **Clicks, forms, login, pagination, or post-scrape browser actions** -> [firecrawl-interact](../firecrawl-interact/SKILL.md) - **Downloading a site to local files** -> [firecrawl-download](../firecrawl-download/SKILL.md) +- **Parsing a local file (PDF, DOCX, XLSX, HTML, etc.)** -> [firecrawl-parse](../firecrawl-parse/SKILL.md) - **Install, auth, or setup problems** -> [rules/install.md](rules/install.md) - **Output handling and safe file-reading patterns** -> [rules/security.md](rules/security.md) - **Integrating Firecrawl into an app, adding `FIRECRAWL_API_KEY` to `.env`, or choosing endpoint usage in product code** -> use the `firecrawl-build` skills (already installed alongside this CLI skill) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md new file mode 100644 index 000000000..3a38b5711 --- /dev/null +++ b/skills/firecrawl-parse/SKILL.md @@ -0,0 +1,69 @@ +--- +name: firecrawl-parse +description: | + Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown, HTML, or structured JSON. Use this skill when the user points at a file on disk and wants its content extracted — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", "DOCX to markdown", or provides a local path (not a URL). Also supports AI summary and query ("what does this PDF say about X?"). Use this instead of `scrape` for anything on the local filesystem. +allowed-tools: + - Bash(firecrawl *) + - Bash(npx firecrawl *) +--- + +# firecrawl parse + +Parse a local file into clean, LLM-optimized markdown. Supported formats: **HTML, PDF, DOCX, DOC, ODT, RTF, XLSX, XLS**. 
+ +## When to use + +- You have a file on disk (not a URL) and want its text +- User drops a PDF/DOCX and asks what it says, or to summarize it +- You need markdown from a Word doc, spreadsheet, or PDF to feed into other tools +- Use `scrape` instead when the source is a URL + +## Quick start + +```bash +# Basic — PDF/DOCX/etc. to markdown +firecrawl parse ./paper.pdf -o .firecrawl/paper.md + +# Summary shortcut (AI-generated overview) +firecrawl parse ./paper.pdf -S -o .firecrawl/summary.md + +# Ask a question about the doc +firecrawl parse ./paper.pdf -Q "What are the main conclusions?" + +# Multiple formats → JSON bundle (markdown + links + summary + metadata) +firecrawl parse ./paper.pdf -f markdown,links,summary --pretty -o .firecrawl/paper.json + +# Raw HTML output +firecrawl parse ./paper.pdf -H -o .firecrawl/paper.html +``` + +## Options + +| Option | Description | +| ------------------------ | --------------------------------------------------------------------------------------------------- | +| `-f, --format ` | Output formats (comma-separated): markdown, html, rawHtml, links, images, summary, json, attributes | +| `-S, --summary` | Shortcut for `--format summary` (AI summary) | +| `-H, --html` | Shortcut for `--format html` (raw HTML) | +| `-Q, --query ` | Ask a question about the parsed content | +| `--only-main-content` | Strip boilerplate, main content only | +| `--include-tags ` | Only include these HTML tags | +| `--exclude-tags ` | Exclude these HTML tags | +| `--timeout ` | Timeout for the parse job | +| `-o, --output ` | Output file path (default: stdout) | +| `--json` | Force JSON output | +| `--pretty` | Pretty-print JSON | +| `--timing` | Show request duration | + +## Tips + +- **Scrape vs parse**: `scrape` takes a URL, `parse` takes a local file path. A remote PDF URL can still go through `scrape`. +- **Single vs multi format**: one `--format` value returns raw content; multiple return JSON with keys for each format. 
+- **Quote paths with spaces**: `firecrawl parse "./My Doc.pdf"`. +- **PDFs may return empty `links`/`images`** — PDF structure doesn't always carry link/image metadata like HTML does. That's expected, not a failure. +- **Large docs**: parse time scales with file size. A ~50-page PDF takes ~10s. Use `--timing` to check. +- **Query vs save-and-grep**: `-Q` is convenient for single questions. For deeper analysis, save to file first, then `grep`/read the markdown. + +## See also + +- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea but for URLs +- [firecrawl-download](../firecrawl-download/SKILL.md) — bulk save a site as local files (which you can then parse) From 031020fa243ab24784e27fb2f87560c1099926db Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 09:28:08 -0400 Subject: [PATCH 03/10] lead parse skill with markdown, demote format matrix markdown is the 90% use case. moved formats table to secondary 'other formats' section. kept format availability caveat (links/images empty for pdfs). --- skills/firecrawl-parse/SKILL.md | 66 +++++++++++++++++---------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md index 3a38b5711..c3f78568d 100644 --- a/skills/firecrawl-parse/SKILL.md +++ b/skills/firecrawl-parse/SKILL.md @@ -1,7 +1,7 @@ --- name: firecrawl-parse description: | - Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown, HTML, or structured JSON. Use this skill when the user points at a file on disk and wants its content extracted — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", "DOCX to markdown", or provides a local path (not a URL). Also supports AI summary and query ("what does this PDF say about X?"). Use this instead of `scrape` for anything on the local filesystem. 
+ Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown. Use this skill when the user points at a file on disk and wants its content — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", "DOCX to markdown", or provides a local path (not a URL). Also supports AI summary and Q&A. Use this instead of `scrape` for anything on the local filesystem. allowed-tools: - Bash(firecrawl *) - Bash(npx firecrawl *) @@ -9,61 +9,65 @@ allowed-tools: # firecrawl parse -Parse a local file into clean, LLM-optimized markdown. Supported formats: **HTML, PDF, DOCX, DOC, ODT, RTF, XLSX, XLS**. +Turn any local document into clean markdown. Supported file types: **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML**. ## When to use -- You have a file on disk (not a URL) and want its text +- You have a file on disk (not a URL) and want its text as markdown - User drops a PDF/DOCX and asks what it says, or to summarize it -- You need markdown from a Word doc, spreadsheet, or PDF to feed into other tools +- You need a Word doc, spreadsheet, or PDF as markdown to feed into other tools - Use `scrape` instead when the source is a URL ## Quick start ```bash -# Basic — PDF/DOCX/etc. to markdown +# Any file → clean markdown firecrawl parse ./paper.pdf -o .firecrawl/paper.md -# Summary shortcut (AI-generated overview) +# AI summary firecrawl parse ./paper.pdf -S -o .firecrawl/summary.md # Ask a question about the doc firecrawl parse ./paper.pdf -Q "What are the main conclusions?" - -# Multiple formats → JSON bundle (markdown + links + summary + metadata) -firecrawl parse ./paper.pdf -f markdown,links,summary --pretty -o .firecrawl/paper.json - -# Raw HTML output -firecrawl parse ./paper.pdf -H -o .firecrawl/paper.html ``` +That covers almost every case. The rest below is for when you need more. 
+ ## Options -| Option | Description | -| ------------------------ | --------------------------------------------------------------------------------------------------- | -| `-f, --format ` | Output formats (comma-separated): markdown, html, rawHtml, links, images, summary, json, attributes | -| `-S, --summary` | Shortcut for `--format summary` (AI summary) | -| `-H, --html` | Shortcut for `--format html` (raw HTML) | -| `-Q, --query ` | Ask a question about the parsed content | -| `--only-main-content` | Strip boilerplate, main content only | -| `--include-tags ` | Only include these HTML tags | -| `--exclude-tags ` | Exclude these HTML tags | -| `--timeout ` | Timeout for the parse job | -| `-o, --output ` | Output file path (default: stdout) | -| `--json` | Force JSON output | -| `--pretty` | Pretty-print JSON | -| `--timing` | Show request duration | +| Option | Description | +| ---------------------- | ------------------------------------------------ | +| `-S, --summary` | AI-generated summary (shortcut for `-f summary`) | +| `-Q, --query ` | Ask a question about the parsed content | +| `-o, --output ` | Output file path (default: stdout) | +| `--only-main-content` | Strip boilerplate | +| `--timing` | Show request duration | + +## Other formats + +Default output is markdown. Pass `-f` to request alternates or bundles: + +```bash +firecrawl parse ./paper.pdf -f html -o paper.html # cleaned HTML +firecrawl parse ./page.html -f markdown,links,images \ # JSON bundle + --pretty -o page.json +``` + +- `markdown` (default), `html`, `rawHtml`, `summary` — work on every file type +- `links`, `images` — work on HTML input; **return empty arrays for PDF/DOCX** (those formats don't carry link/image structure) +- Multiple formats → JSON output keyed by format name +- For structured/schema-based extraction, use `firecrawl agent` instead ## Tips -- **Scrape vs parse**: `scrape` takes a URL, `parse` takes a local file path. A remote PDF URL can still go through `scrape`. 
-- **Single vs multi format**: one `--format` value returns raw content; multiple return JSON with keys for each format. +- **Scrape vs parse**: `scrape` takes a URL, `parse` takes a local file path. - **Quote paths with spaces**: `firecrawl parse "./My Doc.pdf"`. -- **PDFs may return empty `links`/`images`** — PDF structure doesn't always carry link/image metadata like HTML does. That's expected, not a failure. -- **Large docs**: parse time scales with file size. A ~50-page PDF takes ~10s. Use `--timing` to check. -- **Query vs save-and-grep**: `-Q` is convenient for single questions. For deeper analysis, save to file first, then `grep`/read the markdown. +- **Credits scale with PDF pages**: ~1 credit per page. HTML is typically 1 credit flat. +- **Parse time**: ~10s for a 50-page PDF. Use `--timing` to measure. +- **Query vs save-and-grep**: `-Q` is great for a single question. For deeper analysis, save to markdown first, then `grep` or read the file. ## See also - [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea but for URLs +- [firecrawl-agent](../firecrawl-agent/SKILL.md) — structured data extraction with a schema - [firecrawl-download](../firecrawl-download/SKILL.md) — bulk save a site as local files (which you can then parse) From 8985ddce01c44b9b30949004b2159260c829c76a Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 09:34:08 -0400 Subject: [PATCH 04/10] bias parse skill toward save-to-file, drop weak formats default pattern: parse to .firecrawl/ then grep/read. removed links/images/rawhtml from skill - kept markdown/html/summary (the formats that work cleanly on all supported file types). 
--- skills/firecrawl-parse/SKILL.md | 59 +++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md index c3f78568d..c876e86b6 100644 --- a/skills/firecrawl-parse/SKILL.md +++ b/skills/firecrawl-parse/SKILL.md @@ -1,7 +1,7 @@ --- name: firecrawl-parse description: | - Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown. Use this skill when the user points at a file on disk and wants its content — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", "DOCX to markdown", or provides a local path (not a URL). Also supports AI summary and Q&A. Use this instead of `scrape` for anything on the local filesystem. + Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown saved to disk. Use this skill when the user points at a file on disk and wants its content — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", "DOCX to markdown", or provides a local path (not a URL). Also supports AI summary and Q&A. Use this instead of `scrape` for anything on the local filesystem. allowed-tools: - Bash(firecrawl *) - Bash(npx firecrawl *) @@ -9,7 +9,7 @@ allowed-tools: # firecrawl parse -Turn any local document into clean markdown. Supported file types: **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML**. +Turn any local document into clean markdown on disk. Supported file types: **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML**. ## When to use @@ -18,17 +18,39 @@ Turn any local document into clean markdown. Supported file types: **PDF, DOCX, - You need a Word doc, spreadsheet, or PDF as markdown to feed into other tools - Use `scrape` instead when the source is a URL +## Always save to a file + +**Default pattern**: parse to a file in `.firecrawl/`, then read or `grep` it. 
Don't stream full parsed content into the conversation — parsed docs can be hundreds of KB and blow up context windows. + +```bash +mkdir -p .firecrawl + +# Always use -o. Name the output after the source file. +firecrawl parse ./paper.pdf -o .firecrawl/paper.md +``` + +Add `.firecrawl/` to `.gitignore`. + +After parsing, work with the file incrementally: + +```bash +wc -l .firecrawl/paper.md # size check first +head -50 .firecrawl/paper.md # preview +grep -n "conclusion" .firecrawl/paper.md # targeted lookup +``` + ## Quick start ```bash -# Any file → clean markdown +# File → markdown on disk firecrawl parse ./paper.pdf -o .firecrawl/paper.md -# AI summary -firecrawl parse ./paper.pdf -S -o .firecrawl/summary.md +# AI summary to its own file +firecrawl parse ./paper.pdf -S -o .firecrawl/paper-summary.md -# Ask a question about the doc -firecrawl parse ./paper.pdf -Q "What are the main conclusions?" +# Q&A — small answers are okay in stdout, but save if you might reuse +firecrawl parse ./paper.pdf -Q "What are the main conclusions?" \ + -o .firecrawl/paper-conclusions.md ``` That covers almost every case. The rest below is for when you need more. @@ -39,7 +61,7 @@ That covers almost every case. The rest below is for when you need more. | ---------------------- | ------------------------------------------------ | | `-S, --summary` | AI-generated summary (shortcut for `-f summary`) | | `-Q, --query ` | Ask a question about the parsed content | -| `-o, --output ` | Output file path (default: stdout) | +| `-o, --output ` | Output file path — **always use this** | | `--only-main-content` | Strip boilerplate | | `--timing` | Show request duration | @@ -48,23 +70,26 @@ That covers almost every case. The rest below is for when you need more. Default output is markdown. 
Pass `-f` to request alternates or bundles: ```bash -firecrawl parse ./paper.pdf -f html -o paper.html # cleaned HTML -firecrawl parse ./page.html -f markdown,links,images \ # JSON bundle - --pretty -o page.json +# Cleaned HTML instead of markdown +firecrawl parse ./paper.pdf -f html -o .firecrawl/paper.html + +# Markdown + summary together (JSON bundle) +firecrawl parse ./paper.pdf -f markdown,summary --pretty \ + -o .firecrawl/paper-bundle.json ``` -- `markdown` (default), `html`, `rawHtml`, `summary` — work on every file type -- `links`, `images` — work on HTML input; **return empty arrays for PDF/DOCX** (those formats don't carry link/image structure) -- Multiple formats → JSON output keyed by format name -- For structured/schema-based extraction, use `firecrawl agent` instead +Formats: `markdown` (default), `html`, `summary`. Multiple formats → JSON output keyed by format name. + +For structured/schema-based extraction, use `firecrawl agent` instead. ## Tips - **Scrape vs parse**: `scrape` takes a URL, `parse` takes a local file path. -- **Quote paths with spaces**: `firecrawl parse "./My Doc.pdf"`. +- **Quote paths with spaces**: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`. - **Credits scale with PDF pages**: ~1 credit per page. HTML is typically 1 credit flat. - **Parse time**: ~10s for a 50-page PDF. Use `--timing` to measure. -- **Query vs save-and-grep**: `-Q` is great for a single question. For deeper analysis, save to markdown first, then `grep` or read the file. +- **Naming convention**: `.firecrawl/{source-basename}.md` — keeps outputs easy to find and re-use. +- **Avoid redundant parses**: check `.firecrawl/` before re-parsing the same file. ## See also From 0101fc2e686c72678241d53badfb874bafe598b8 Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 09:34:38 -0400 Subject: [PATCH 05/10] trim parse skill: less is more cut to essentials. 
match density of firecrawl-map. dropped core-capability section, other-formats section, verbose tips. --- skills/firecrawl-parse/SKILL.md | 82 +++++++++------------------------ 1 file changed, 22 insertions(+), 60 deletions(-) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md index c876e86b6..fa5ec6fc3 100644 --- a/skills/firecrawl-parse/SKILL.md +++ b/skills/firecrawl-parse/SKILL.md @@ -1,7 +1,7 @@ --- name: firecrawl-parse description: | - Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown saved to disk. Use this skill when the user points at a file on disk and wants its content — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", "DOCX to markdown", or provides a local path (not a URL). Also supports AI summary and Q&A. Use this instead of `scrape` for anything on the local filesystem. + Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown saved to disk. Use this skill when the user points at a file on disk and wants its content — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", or provides a local path (not a URL). Also supports AI summary and Q&A. Use this instead of `scrape` for local files. allowed-tools: - Bash(firecrawl *) - Bash(npx firecrawl *) @@ -9,90 +9,52 @@ allowed-tools: # firecrawl parse -Turn any local document into clean markdown on disk. Supported file types: **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML**. +Turn a local document into clean markdown on disk. Supports **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML**. 
## When to use - You have a file on disk (not a URL) and want its text as markdown - User drops a PDF/DOCX and asks what it says, or to summarize it -- You need a Word doc, spreadsheet, or PDF as markdown to feed into other tools - Use `scrape` instead when the source is a URL -## Always save to a file +## Quick start -**Default pattern**: parse to a file in `.firecrawl/`, then read or `grep` it. Don't stream full parsed content into the conversation — parsed docs can be hundreds of KB and blow up context windows. +Always save to `.firecrawl/` with `-o` — parsed docs can be hundreds of KB and blow up context if streamed to stdout. Add `.firecrawl/` to `.gitignore`. ```bash mkdir -p .firecrawl -# Always use -o. Name the output after the source file. -firecrawl parse ./paper.pdf -o .firecrawl/paper.md -``` - -Add `.firecrawl/` to `.gitignore`. - -After parsing, work with the file incrementally: - -```bash -wc -l .firecrawl/paper.md # size check first -head -50 .firecrawl/paper.md # preview -grep -n "conclusion" .firecrawl/paper.md # targeted lookup -``` - -## Quick start - -```bash -# File → markdown on disk +# File → markdown firecrawl parse ./paper.pdf -o .firecrawl/paper.md -# AI summary to its own file +# AI summary firecrawl parse ./paper.pdf -S -o .firecrawl/paper-summary.md -# Q&A — small answers are okay in stdout, but save if you might reuse +# Ask a question about the doc firecrawl parse ./paper.pdf -Q "What are the main conclusions?" \ - -o .firecrawl/paper-conclusions.md + -o .firecrawl/paper-qa.md ``` -That covers almost every case. The rest below is for when you need more. +Then `head`, `grep`, or incrementally read the file — don't load the whole thing at once. 
## Options

-| Option | Description |
-| ---------------------- | ------------------------------------------------ |
-| `-S, --summary` | AI-generated summary (shortcut for `-f summary`) |
-| `-Q, --query <query>` | Ask a question about the parsed content |
-| `-o, --output <path>` | Output file path — **always use this** |
-| `--only-main-content` | Strip boilerplate |
-| `--timing` | Show request duration |
-
-## Other formats
-
-Default output is markdown. Pass `-f` to request alternates or bundles:
-
-```bash
-# Cleaned HTML instead of markdown
-firecrawl parse ./paper.pdf -f html -o .firecrawl/paper.html
-
-# Markdown + summary together (JSON bundle)
-firecrawl parse ./paper.pdf -f markdown,summary --pretty \
-  -o .firecrawl/paper-bundle.json
-```
-
-Formats: `markdown` (default), `html`, `summary`. Multiple formats → JSON output keyed by format name.
-
-For structured/schema-based extraction, use `firecrawl agent` instead.
+| Option | Description |
+| ---------------------- | --------------------------------------- |
+| `-S, --summary` | AI-generated summary |
+| `-Q, --query <query>` | Ask a question about the parsed content |
+| `-o, --output <path>` | Output file path — **always use this** |
+| `-f, --format <formats>` | `markdown` (default), `html`, `summary` |
+| `--only-main-content` | Strip boilerplate |
+| `--timing` | Show request duration |

## Tips

-- **Scrape vs parse**: `scrape` takes a URL, `parse` takes a local file path.
-- **Quote paths with spaces**: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`.
-- **Credits scale with PDF pages**: ~1 credit per page. HTML is typically 1 credit flat.
-- **Parse time**: ~10s for a 50-page PDF. Use `--timing` to measure.
-- **Naming convention**: `.firecrawl/{source-basename}.md` — keeps outputs easy to find and re-use.
-- **Avoid redundant parses**: check `.firecrawl/` before re-parsing the same file.
+- Quote paths with spaces: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`.
+- Credits: ~1 per PDF page; HTML is 1 flat.
+- Check `.firecrawl/` before re-parsing the same file. ## See also -- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea but for URLs -- [firecrawl-agent](../firecrawl-agent/SKILL.md) — structured data extraction with a schema -- [firecrawl-download](../firecrawl-download/SKILL.md) — bulk save a site as local files (which you can then parse) +- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea for URLs +- [firecrawl-agent](../firecrawl-agent/SKILL.md) — structured extraction with a schema From 33d6b3ba6493bfbfa2325b5c966452561cb1b876 Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 09:37:44 -0400 Subject: [PATCH 06/10] align parse skill with /v2/parse api ground truth add .htm/.xhtml to supported list, add --timeout option. mirrors ParseFile/ParseOptions from firecrawl core (commit c7993d7). --- skills/firecrawl-parse/SKILL.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md index fa5ec6fc3..6437782ff 100644 --- a/skills/firecrawl-parse/SKILL.md +++ b/skills/firecrawl-parse/SKILL.md @@ -9,7 +9,7 @@ allowed-tools: # firecrawl parse -Turn a local document into clean markdown on disk. Supports **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML**. +Turn a local document into clean markdown on disk. Supports **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML/HTM/XHTML**. 
## When to use

@@ -46,6 +46,7 @@
 | `-o, --output <path>` | Output file path — **always use this** |
 | `-f, --format <formats>` | `markdown` (default), `html`, `summary` |
 | `--only-main-content` | Strip boilerplate |
+| `--timeout <ms>` | Timeout for the parse job |
 | `--timing` | Show request duration |

## Tips

From 58df65673541d5a8f7c3b15334468d1a2403b402 Mon Sep 17 00:00:00 2001
From: Developers Digest <124798203+developersdigest@users.noreply.github.com>
Date: Wed, 22 Apr 2026 09:38:02 -0400
Subject: [PATCH 07/10] drop --only-main-content from parse skill

option exists on api but isn't meaningful for pdf/docx/xlsx - no
nav/footer/sidebar chrome to strip on local documents.
---
 skills/firecrawl-parse/SKILL.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md
index 6437782ff..049cda24f 100644
--- a/skills/firecrawl-parse/SKILL.md
+++ b/skills/firecrawl-parse/SKILL.md
@@ -45,7 +45,6 @@ Then `head`, `grep`, or incrementally read the file — don't load the whole thi
 | `-Q, --query <query>` | Ask a question about the parsed content |
 | `-o, --output <path>` | Output file path — **always use this** |
 | `-f, --format <formats>` | `markdown` (default), `html`, `summary` |
-| `--only-main-content` | Strip boilerplate |
 | `--timeout <ms>` | Timeout for the parse job |
 | `--timing` | Show request duration |

## Tips

From 1cbf76344d67b3d6ed2f6d52926758d44aeee51e Mon Sep 17 00:00:00 2001
From: Developers Digest <124798203+developersdigest@users.noreply.github.com>
Date: Wed, 22 Apr 2026 09:45:44 -0400
Subject: [PATCH 08/10] refine firecrawl-parse skill

---
 skills/firecrawl-parse/SKILL.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md
index 049cda24f..d6322b04f 100644
--- a/skills/firecrawl-parse/SKILL.md
+++ b/skills/firecrawl-parse/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: firecrawl-parse
 description: |
- Convert a local file (PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML) into clean markdown saved to disk. Use this skill when the user points at a file on disk and wants its content — says "parse this PDF", "convert this Word doc", "read this file", "extract text from", "PDF to markdown", or provides a local path (not a URL). Also supports AI summary and Q&A. Use this instead of `scrape` for local files.
+ Efficiently extract and convert the contents of any local file—such as PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, or HTML—into clean, well-formatted markdown saved to disk. Use this skill whenever the user requests to parse, read, or extract information from a file on their computer, including phrases like "parse this PDF", "convert this document", "read this file", "extract text from", or when a local file path (not a URL) is provided. This skill offers advanced options like generating AI-powered summaries and answering questions based on the file's content. Prefer this tool over `scrape` when handling local files to deliver precise, structured outputs for downstream tasks.
 allowed-tools:
 - Bash(firecrawl *)
 - Bash(npx firecrawl *)
@@ -35,7 +35,7 @@ firecrawl parse ./paper.pdf -Q "What are the main conclusions?" \
 -o .firecrawl/paper-qa.md
 ```

-Then `head`, `grep`, or incrementally read the file — don't load the whole thing at once.
+Then `head`, `grep`, `rg` etc., or incrementally read the file — don't load the whole thing at once.

## Options

@@ -53,8 +53,8 @@ Then `head`, `grep`, `rg` etc., or incrementally read the file — don't load the

## Tips

- Quote paths with spaces: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`.
- Credits: ~1 per PDF page; HTML is 1 flat.
- Check `.firecrawl/` before re-parsing the same file.
+- To check your credit balance (recommended for batch processing and similar workflows), use the `firecrawl credit-usage` command.
## See also - [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea for URLs -- [firecrawl-agent](../firecrawl-agent/SKILL.md) — structured extraction with a schema From 1f8add95eb7bcae210238ef52c11adfef69f6b52 Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 09:47:17 -0400 Subject: [PATCH 09/10] note 50 mb upload limit in parse skill --- skills/firecrawl-parse/SKILL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md index d6322b04f..f350cb2a9 100644 --- a/skills/firecrawl-parse/SKILL.md +++ b/skills/firecrawl-parse/SKILL.md @@ -51,6 +51,7 @@ Then `head`, `grep`, `rg` etc., or incrementally read the file - don't load the ## Tips - Quote paths with spaces: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`. +- Max upload size: **50 MB** per file. - Credits: ~1 per PDF page; HTML is 1 flat. - Check `.firecrawl/` before re-parsing the same file. - To check your credit balance (recommended for batch processing and similar workflows), use the `firecrawl credit-usage` command. From aaca75fd1b76fee165f1b0c7df7506c8234d43fc Mon Sep 17 00:00:00 2001 From: Developers Digest <124798203+developersdigest@users.noreply.github.com> Date: Wed, 22 Apr 2026 11:09:17 -0400 Subject: [PATCH 10/10] bump version to 1.16.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 27ab11144..fcc50c6a7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.15.2", + "version": "1.16.0", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": {