diff --git a/package.json b/package.json index 27ab11144..fcc50c6a7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.15.2", + "version": "1.16.0", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { diff --git a/skills/firecrawl-cli/SKILL.md b/skills/firecrawl-cli/SKILL.md index 4664c6207..8d7e6a7c7 100644 --- a/skills/firecrawl-cli/SKILL.md +++ b/skills/firecrawl-cli/SKILL.md @@ -62,6 +62,7 @@ Follow this escalation pattern: | AI-powered data extraction | `agent` | Need structured data from complex sites | | Interact with a page | `scrape` + `interact` | Content requires clicks, form fills, pagination, or login | | Download a site to files | `download` | Save an entire site as local files | +| Parse a local file | `parse` | File on disk (PDF, DOCX, XLSX, etc.) — not a URL | For detailed command reference, run `firecrawl --help`. @@ -85,6 +86,7 @@ For detailed command reference, run `firecrawl --help`. - **AI-powered structured extraction from complex sites** -> [firecrawl-agent](../firecrawl-agent/SKILL.md) - **Clicks, forms, login, pagination, or post-scrape browser actions** -> [firecrawl-interact](../firecrawl-interact/SKILL.md) - **Downloading a site to local files** -> [firecrawl-download](../firecrawl-download/SKILL.md) +- **Parsing a local file (PDF, DOCX, XLSX, HTML, etc.)** -> [firecrawl-parse](../firecrawl-parse/SKILL.md) - **Install, auth, or setup problems** -> [rules/install.md](rules/install.md) - **Output handling and safe file-reading patterns** -> [rules/security.md](rules/security.md) - **Integrating Firecrawl into an app, adding `FIRECRAWL_API_KEY` to `.env`, or choosing endpoint usage in product code** -> use the `firecrawl-build` skills (already installed alongside this CLI skill) diff --git a/skills/firecrawl-parse/SKILL.md b/skills/firecrawl-parse/SKILL.md new file mode 100644 index 000000000..f350cb2a9 --- /dev/null +++ b/skills/firecrawl-parse/SKILL.md @@ -0,0 +1,61 @@ +--- +name: firecrawl-parse +description: | + Efficiently extract and convert the contents of any local file—such as PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, or HTML—into clean, well-formatted markdown saved to disk. Use this skill whenever the user requests to parse, read, or extract information from a file on their computer, including phrases like “parse this PDF”, “convert this document”, “read this file”, “extract text from”, or when a local file path (not a URL) is provided. This skill offers advanced options like generating AI-powered summaries and answering questions based on the file's content. Prefer this tool over `scrape` when handling local files to deliver precise, structured outputs for downstream tasks. +allowed-tools: + - Bash(firecrawl *) + - Bash(npx firecrawl *) +--- + +# firecrawl parse + +Turn a local document into clean markdown on disk. Supports **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML/HTM/XHTML**. + +## When to use + +- You have a file on disk (not a URL) and want its text as markdown +- User drops a PDF/DOCX and asks what it says, or to summarize it +- Use `scrape` instead when the source is a URL + +## Quick start + +Always save to `.firecrawl/` with `-o` — parsed docs can be hundreds of KB and blow up context if streamed to stdout. Add `.firecrawl/` to `.gitignore`. 
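+
+For the `.gitignore` step, a minimal one-time sketch (assumes the project root is a git repository; the `grep` guard just avoids duplicate entries):
+
+```bash
+# Append .firecrawl/ to .gitignore unless it is already listed
+grep -qxF '.firecrawl/' .gitignore || echo '.firecrawl/' >> .gitignore
+```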
+
+```bash
+mkdir -p .firecrawl
+
+# File → markdown
+firecrawl parse ./paper.pdf -o .firecrawl/paper.md
+
+# AI summary
+firecrawl parse ./paper.pdf -S -o .firecrawl/paper-summary.md
+
+# Ask a question about the doc
+firecrawl parse ./paper.pdf -Q "What are the main conclusions?" \
+  -o .firecrawl/paper-qa.md
+```
+
+Then use `head`, `grep`, `rg`, etc., or read the file incrementally; don't load the whole thing at once.
+
+## Options
+
+| Option                   | Description                              |
+| ------------------------ | ---------------------------------------- |
+| `-S, --summary`          | AI-generated summary                     |
+| `-Q, --query <query>`    | Ask a question about the parsed content  |
+| `-o, --output <path>`    | Output file path — **always use this**   |
+| `-f, --format <format>`  | `markdown` (default), `html`, `summary`  |
+| `--timeout <ms>`         | Timeout for the parse job                |
+| `--timing`               | Show request duration                    |
+
+## Tips
+
+- Quote paths with spaces: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`.
+- Max upload size: **50 MB** per file.
+- Credits: ~1 per PDF page; HTML is a flat 1 credit.
+- Check `.firecrawl/` before re-parsing the same file.
+- Check your credit balance with `firecrawl credit-usage` (recommended before batch processing and similar workflows).
+
+## See also
+
+- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea for URLs
diff --git a/src/commands/parse.ts b/src/commands/parse.ts
new file mode 100644
index 000000000..1fd481fc9
--- /dev/null
+++ b/src/commands/parse.ts
@@ -0,0 +1,239 @@
+/**
+ * Parse command implementation
+ *
+ * Uploads a local file to the Firecrawl /v2/parse endpoint and returns the
+ * parsed document in the requested format(s). Supported file types:
+ * .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import type { FormatOption } from '@mendable/firecrawl-js';
+import type { ParseOptions, ParseResult } from '../types/parse';
+import type { ScrapeFormat } from '../types/scrape';
+import { getClient } from '../utils/client';
+import { getConfig, validateConfig } from '../utils/config';
+import { handleScrapeOutput } from '../utils/output';
+
+const DEFAULT_API_URL = 'https://api.firecrawl.dev';
+
+/** File extensions accepted by /v2/parse (mirrors the API controller). */
+const SUPPORTED_EXTENSIONS = new Set([
+  '.html',
+  '.htm',
+  '.pdf',
+  '.docx',
+  '.doc',
+  '.odt',
+  '.rtf',
+  '.xlsx',
+  '.xls',
+]);
+
+/**
+ * Best-effort content-type lookup so the API's kind detector has a hint
+ * even if the extension is ambiguous.
+ */
+const CONTENT_TYPE_BY_EXT: Record<string, string> = {
+  '.html': 'text/html',
+  '.htm': 'text/html',
+  '.pdf': 'application/pdf',
+  '.docx':
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  '.doc': 'application/msword',
+  '.odt': 'application/vnd.oasis.opendocument.text',
+  '.rtf': 'application/rtf',
+  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+  '.xls': 'application/vnd.ms-excel',
+};
+
+function outputTiming(
+  options: ParseOptions,
+  requestStartTime: number,
+  requestEndTime: number,
+  error?: Error | unknown
+): void {
+  if (!options.timing) return;
+
+  const duration = requestEndTime - requestStartTime;
+  const info: Record<string, unknown> = {
+    file: options.file,
+    requestTime: new Date(requestStartTime).toISOString(),
+    duration: `${duration}ms`,
+    status: error ? 'error' : 'success',
+  };
+  if (error) {
+    info.error = error instanceof Error ? error.message : 'Unknown error';
+  }
+  console.error('Timing:', JSON.stringify(info, null, 2));
+}
+
+/**
+ * Build the `formats` array sent to the API (mirrors scrape's behavior).
+ */
+function buildFormats(options: ParseOptions): FormatOption[] {
+  const formats: FormatOption[] = [];
+
+  if (options.formats && options.formats.length > 0) {
+    formats.push(...options.formats);
+  }
+
+  if (options.query) {
+    formats.push({ type: 'query', prompt: options.query } as any);
+  }
+
+  if (formats.length === 0) {
+    formats.push('markdown');
+  }
+
+  return formats;
+}
+
+/**
+ * Build the JSON `options` payload uploaded alongside the file.
+ */
+function buildOptionsPayload(options: ParseOptions): Record<string, unknown> {
+  const payload: Record<string, unknown> = {
+    formats: buildFormats(options),
+    integration: 'cli',
+  };
+
+  if (options.onlyMainContent !== undefined) {
+    payload.onlyMainContent = options.onlyMainContent;
+  }
+  if (options.includeTags && options.includeTags.length > 0) {
+    payload.includeTags = options.includeTags;
+  }
+  if (options.excludeTags && options.excludeTags.length > 0) {
+    payload.excludeTags = options.excludeTags;
+  }
+  if (options.timeout !== undefined) {
+    payload.timeout = options.timeout;
+  }
+  if (options.location) {
+    payload.location = options.location;
+  }
+
+  return payload;
+}
+
+/**
+ * Execute the parse command by POSTing a multipart upload to /v2/parse.
+ */
+export async function executeParse(
+  options: ParseOptions
+): Promise<ParseResult> {
+  const filePath = path.resolve(options.file);
+
+  if (!fs.existsSync(filePath)) {
+    return {
+      success: false,
+      error: `File not found: ${options.file}`,
+    };
+  }
+
+  const stat = fs.statSync(filePath);
+  if (!stat.isFile()) {
+    return {
+      success: false,
+      error: `Not a file: ${options.file}`,
+    };
+  }
+
+  const ext = path.extname(filePath).toLowerCase();
+  if (!SUPPORTED_EXTENSIONS.has(ext)) {
+    return {
+      success: false,
+      error:
+        `Unsupported file type "${ext || '(none)'}". ` +
+        `Supported extensions: ${[...SUPPORTED_EXTENSIONS].join(', ')}`,
+    };
+  }
+
+  // Ensure auth/url is resolved through the same config pipeline scrape uses.
+  if (options.apiKey || options.apiUrl) {
+    getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl });
+  }
+
+  const config = getConfig();
+  const apiKey = options.apiKey || config.apiKey;
+  validateConfig(apiKey);
+
+  const apiUrl = (options.apiUrl || config.apiUrl || DEFAULT_API_URL).replace(
+    /\/$/,
+    ''
+  );
+
+  const buffer = fs.readFileSync(filePath);
+  const filename = path.basename(filePath);
+  const contentType = CONTENT_TYPE_BY_EXT[ext] ?? 'application/octet-stream';
+
+  const form = new FormData();
+  form.append(
+    'file',
+    new Blob([new Uint8Array(buffer)], { type: contentType }),
+    filename
+  );
+  form.append('options', JSON.stringify(buildOptionsPayload(options)));
+
+  const requestStartTime = Date.now();
+
+  try {
+    const response = await fetch(`${apiUrl}/v2/parse`, {
+      method: 'POST',
+      headers: apiKey ? { Authorization: `Bearer ${apiKey}` } : undefined,
+      body: form,
+    });
+
+    const requestEndTime = Date.now();
+    outputTiming(options, requestStartTime, requestEndTime);
+
+    const payload = (await response.json().catch(() => ({}))) as any;
+
+    if (!response.ok || payload?.success === false) {
+      const message =
+        payload?.error ||
+        `HTTP ${response.status}: ${response.statusText || 'Request failed'}`;
+      return { success: false, error: message };
+    }
+
+    return {
+      success: true,
+      data: payload?.data ?? payload,
+    };
+  } catch (error) {
+    const requestEndTime = Date.now();
+    outputTiming(options, requestStartTime, requestEndTime, error);
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : 'Unknown error occurred',
+    };
+  }
+}
+
+/**
+ * Handle parse command output. Reuses the scrape output formatter since the
+ * /v2/parse response shape matches /v2/scrape.
+ */
+export async function handleParseCommand(options: ParseOptions): Promise<void> {
+  const result = await executeParse(options);
+
+  if (options.query && result.success && result.data?.answer) {
+    const { writeOutput } = await import('../utils/output');
+    writeOutput(result.data.answer, options.output, !!options.output);
+    return;
+  }
+
+  const effectiveFormats: ScrapeFormat[] =
+    options.formats && options.formats.length > 0
+      ? [...options.formats]
+      : ['markdown'];
+
+  handleScrapeOutput(
+    result,
+    effectiveFormats,
+    options.output,
+    options.pretty,
+    options.json
+  );
+}
diff --git a/src/index.ts b/src/index.ts
index 1faec0138..87076889b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -16,6 +16,7 @@ import { configure, viewConfig } from './commands/config';
 import { handleCreditUsageCommand } from './commands/credit-usage';
 import { handleCrawlCommand } from './commands/crawl';
 import { handleMapCommand } from './commands/map';
+import { handleParseCommand } from './commands/parse';
 import { handleSearchCommand } from './commands/search';
 import { handleAgentCommand } from './commands/agent';
 import {
@@ -61,6 +62,7 @@ const AUTH_REQUIRED_COMMANDS = [
   'download',
   'crawl',
   'map',
+  'parse',
   'search',
   'agent',
   'browser',
@@ -498,6 +500,97 @@ function createMapCommand(): Command {
   return mapCmd;
 }
 
+/**
+ * Create and configure the parse command
+ */
+function createParseCommand(): Command {
+  const parseCmd = new Command('parse')
+    .description(
+      'Parse a local file (HTML, PDF, DOCX, DOC, ODT, RTF, XLSX, XLS) into markdown, HTML, links, JSON, and more. Uses /v2/parse.'
+    )
+    .argument('<file>', 'Path to the local file to parse')
+    .option('-H, --html', 'Output raw HTML (shortcut for --format html)')
+    .option(
+      '-f, --format <formats>',
+      'Output format(s). Multiple formats can be specified with commas (e.g., "markdown,links"). Available: markdown, html, rawHtml, links, images, summary, json, attributes. Single format outputs raw content; multiple formats output JSON.'
+    )
+    .option('--only-main-content', 'Include only main content', false)
+    .option('-S, --summary', 'Output summary (shortcut for --format summary)')
+    .option('--include-tags <tags>', 'Comma-separated list of tags to include')
+    .option('--exclude-tags <tags>', 'Comma-separated list of tags to exclude')
+    .option(
+      '--timeout <ms>',
+      'Timeout in milliseconds for the parse job',
+      parseInt
+    )
+    .option(
+      '-Q, --query <query>',
+      'Ask a question about the parsed content (query format)'
+    )
+    .option(
+      '-k, --api-key <key>',
+      'Firecrawl API key (overrides global --api-key)'
+    )
+    .option('--api-url <url>', 'API URL (overrides global --api-url)')
+    .option('-o, --output <path>', 'Output file path (default: stdout)')
+    .option('--json', 'Output as JSON format', false)
+    .option('--pretty', 'Pretty print JSON output', false)
+    .option(
+      '--timing',
+      'Show request timing and other useful information',
+      false
+    )
+    .addHelpText(
+      'after',
+      `
+Examples:
+  $ firecrawl parse ./report.pdf
+  $ firecrawl parse ./report.pdf -f markdown,links
+  $ firecrawl parse ./page.html -H
+  $ firecrawl parse ./contract.docx --only-main-content
+  $ firecrawl parse ./report.pdf -Q "What is the total revenue?"
+ $ firecrawl parse ./report.pdf --json --pretty -o report.json + +Supported file types: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls +Max upload size: 50 MB +` + ) + .action(async (file: string, options) => { + let format: string | undefined; + if (options.html) { + format = 'html'; + } else if (options.summary) { + format = 'summary'; + } else if (options.format) { + format = options.format; + } + + const scrapeOptions = parseScrapeOptions({ + ...options, + url: 'file://' + file, + format: format ?? 'markdown', + }); + + await handleParseCommand({ + file, + formats: scrapeOptions.formats, + onlyMainContent: scrapeOptions.onlyMainContent, + includeTags: scrapeOptions.includeTags, + excludeTags: scrapeOptions.excludeTags, + timeout: options.timeout, + apiKey: options.apiKey, + apiUrl: options.apiUrl, + output: options.output, + pretty: options.pretty, + json: options.json, + timing: options.timing, + query: options.query, + }); + }); + + return parseCmd; +} + /** * Create and configure the search command */ @@ -1185,6 +1278,7 @@ Examples: // Add core commands to main program program.addCommand(createCrawlCommand()); program.addCommand(createMapCommand()); +program.addCommand(createParseCommand()); program.addCommand(createSearchCommand()); program.addCommand(createAgentCommand()); program.addCommand(createInteractCommand()); diff --git a/src/types/parse.ts b/src/types/parse.ts new file mode 100644 index 000000000..7499ef541 --- /dev/null +++ b/src/types/parse.ts @@ -0,0 +1,42 @@ +/** + * Types and interfaces for the parse command + */ + +import type { ScrapeFormat, ScrapeLocation } from './scrape'; + +export interface ParseOptions { + /** Local file path to parse */ + file: string; + /** Output format(s) */ + formats?: ScrapeFormat[]; + /** Include only main content */ + onlyMainContent?: boolean; + /** Include tags */ + includeTags?: string[]; + /** Exclude tags */ + excludeTags?: string[]; + /** Timeout in milliseconds for the parse job */ + timeout?: number; + /** API key for Firecrawl */ + apiKey?: string; + /** API URL for Firecrawl */ + apiUrl?: string; + /** Output file path */ + output?: string; + /** Pretty print JSON output */ + pretty?: boolean; + /** Force JSON output */ + json?: boolean; + /** Show request timing */ + timing?: boolean; + /** Location for geo-targeted parsing (typically unused for local files) */ + location?: ScrapeLocation; + /** Ask a question about the parsed content (query format) */ + query?: string; +} + +export interface ParseResult { + success: boolean; + data?: any; + error?: string; +}
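
For a quick end-to-end check of the new command, the sequence below sticks to flags defined in this diff; it is a sketch that assumes a local `sample.pdf` and an already configured `FIRECRAWL_API_KEY`.

```bash
# Keep parsed output on disk instead of streaming it into context
mkdir -p .firecrawl

# Default markdown output, with request timing printed to stderr
firecrawl parse ./sample.pdf -o .firecrawl/sample.md --timing

# Multiple formats are returned as JSON
firecrawl parse ./sample.pdf -f markdown,links --json --pretty -o .firecrawl/sample.json

# Query format: ask a question about the document
firecrawl parse ./sample.pdf -Q "What is this document about?" -o .firecrawl/sample-qa.md

# Parsing costs roughly one credit per PDF page, so check the balance afterwards
firecrawl credit-usage
```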