Skip to content
Merged
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "firecrawl-cli",
"version": "1.15.2",
"version": "1.16.0",
"description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.",
"main": "dist/index.js",
"bin": {
Expand Down
2 changes: 2 additions & 0 deletions skills/firecrawl-cli/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Follow this escalation pattern:
| AI-powered data extraction | `agent` | Need structured data from complex sites |
| Interact with a page | `scrape` + `interact` | Content requires clicks, form fills, pagination, or login |
| Download a site to files | `download` | Save an entire site as local files |
| Parse a local file | `parse` | File on disk (PDF, DOCX, XLSX, etc.) — not a URL |

For detailed command reference, run `firecrawl <command> --help`.

Expand All @@ -85,6 +86,7 @@ For detailed command reference, run `firecrawl <command> --help`.
- **AI-powered structured extraction from complex sites** -> [firecrawl-agent](../firecrawl-agent/SKILL.md)
- **Clicks, forms, login, pagination, or post-scrape browser actions** -> [firecrawl-interact](../firecrawl-interact/SKILL.md)
- **Downloading a site to local files** -> [firecrawl-download](../firecrawl-download/SKILL.md)
- **Parsing a local file (PDF, DOCX, XLSX, HTML, etc.)** -> [firecrawl-parse](../firecrawl-parse/SKILL.md)
- **Install, auth, or setup problems** -> [rules/install.md](rules/install.md)
- **Output handling and safe file-reading patterns** -> [rules/security.md](rules/security.md)
- **Integrating Firecrawl into an app, adding `FIRECRAWL_API_KEY` to `.env`, or choosing endpoint usage in product code** -> use the `firecrawl-build` skills (already installed alongside this CLI skill)
Expand Down
61 changes: 61 additions & 0 deletions skills/firecrawl-parse/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
---
name: firecrawl-parse
description: |
Efficiently extract and convert the contents of any local file—such as PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, or HTML—into clean, well-formatted markdown saved to disk. Use this skill whenever the user requests to parse, read, or extract information from a file on their computer, including phrases like “parse this PDF”, “convert this document”, “read this file”, “extract text from”, or when a local file path (not a URL) is provided. This skill offers advanced options like generating AI-powered summaries and answering questions based on the file's content. Prefer this tool over `scrape` when handling local files to deliver precise, structured outputs for downstream tasks.
allowed-tools:
- Bash(firecrawl *)
- Bash(npx firecrawl *)
---

# firecrawl parse

Turn a local document into clean markdown on disk. Supports **PDF, DOCX, DOC, ODT, RTF, XLSX, XLS, HTML/HTM/XHTML**.

## When to use

- You have a file on disk (not a URL) and want its text as markdown
- User drops a PDF/DOCX and asks what it says, or to summarize it
- Use `scrape` instead when the source is a URL

## Quick start

Always save to `.firecrawl/` with `-o` — parsed docs can be hundreds of KB and blow up context if streamed to stdout. Add `.firecrawl/` to `.gitignore`.

```bash
mkdir -p .firecrawl

# File → markdown
firecrawl parse ./paper.pdf -o .firecrawl/paper.md

# AI summary
firecrawl parse ./paper.pdf -S -o .firecrawl/paper-summary.md

# Ask a question about the doc
firecrawl parse ./paper.pdf -Q "What are the main conclusions?" \
-o .firecrawl/paper-qa.md
```

Then use `head`, `grep`, `rg`, etc., or read the file incrementally — don't load the whole thing into context at once.

## Options

| Option | Description |
| ---------------------- | --------------------------------------- |
| `-S, --summary` | AI-generated summary |
| `-Q, --query <prompt>` | Ask a question about the parsed content |
| `-o, --output <path>` | Output file path — **always use this** |
| `-f, --format <fmt>` | `markdown` (default), `html`, `summary` |
| `--timeout <ms>` | Timeout for the parse job |
| `--timing` | Show request duration |

## Tips

- Quote paths with spaces: `firecrawl parse "./My Doc.pdf" -o .firecrawl/mydoc.md`.
- Max upload size: **50 MB** per file.
- Credits: ~1 per PDF page; HTML costs a flat 1 credit.
- Check `.firecrawl/` before re-parsing the same file.
- To check your credit balance (recommended for batch processing and similar workflows), use the `firecrawl credit-usage` command.

## See also

- [firecrawl-scrape](../firecrawl-scrape/SKILL.md) — same idea for URLs
239 changes: 239 additions & 0 deletions src/commands/parse.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
/**
* Parse command implementation
*
* Uploads a local file to the Firecrawl /v2/parse endpoint and returns the
* parsed document in the requested format(s). Supported file types:
* .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls
*/

import * as fs from 'fs';
import * as path from 'path';
import type { FormatOption } from '@mendable/firecrawl-js';
import type { ParseOptions, ParseResult } from '../types/parse';
import type { ScrapeFormat } from '../types/scrape';
import { getClient } from '../utils/client';
import { getConfig, validateConfig } from '../utils/config';
import { handleScrapeOutput } from '../utils/output';

/** Fallback API base URL when neither --api-url nor saved config provides one. */
const DEFAULT_API_URL = 'https://api.firecrawl.dev';

/** File extensions accepted by /v2/parse (mirrors the API controller). */
const SUPPORTED_EXTENSIONS = new Set([
  '.html',
  '.htm',
  '.pdf',
  '.docx',
  '.doc',
  '.odt',
  '.rtf',
  '.xlsx',
  '.xls',
]);

/**
 * Best-effort content-type lookup so the API's kind detector has a hint
 * even if the extension is ambiguous. Extensions missing from this map fall
 * back to application/octet-stream at the call site (see executeParse).
 */
const CONTENT_TYPE_BY_EXT: Record<string, string> = {
  '.html': 'text/html',
  '.htm': 'text/html',
  '.pdf': 'application/pdf',
  '.docx':
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  '.doc': 'application/msword',
  '.odt': 'application/vnd.oasis.opendocument.text',
  '.rtf': 'application/rtf',
  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  '.xls': 'application/vnd.ms-excel',
};

/**
 * Print request timing info to stderr when the user passed `--timing`.
 * No-op otherwise. Uses stderr so timing never pollutes machine-readable
 * stdout output.
 *
 * @param options - Parse options; only `timing` and `file` are read here.
 * @param requestStartTime - Epoch ms when the request started.
 * @param requestEndTime - Epoch ms when the request finished (or failed).
 * @param error - Optional failure; when present, status becomes "error" and
 *   the message is included. (`Error | unknown` collapsed to `unknown` —
 *   the union was redundant.)
 */
function outputTiming(
  options: ParseOptions,
  requestStartTime: number,
  requestEndTime: number,
  error?: unknown
): void {
  if (!options.timing) return;

  const duration = requestEndTime - requestStartTime;
  const info: Record<string, string> = {
    file: options.file,
    requestTime: new Date(requestStartTime).toISOString(),
    duration: `${duration}ms`,
    status: error ? 'error' : 'success',
  };
  if (error) {
    info.error = error instanceof Error ? error.message : 'Unknown error';
  }
  console.error('Timing:', JSON.stringify(info, null, 2));
}

/**
 * Build the `formats` array sent to the API (mirrors scrape's behavior).
 *
 * Precedence: explicit `--format` values first, then an appended query
 * format when `-Q` was given. Defaults to `['markdown']` only when nothing
 * at all was requested.
 */
function buildFormats(options: ParseOptions): FormatOption[] {
  const formats: FormatOption[] = [];

  if (options.formats && options.formats.length > 0) {
    formats.push(...options.formats);
  }

  if (options.query) {
    // The SDK's FormatOption union does not model the `query` format yet,
    // so a cast is unavoidable; keep it scoped to this single literal.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    formats.push({ type: 'query', prompt: options.query } as any);
  }

  // Nothing requested at all -> default to markdown, like scrape does.
  if (formats.length === 0) {
    formats.push('markdown');
  }

  return formats;
}

/**
 * Assemble the JSON `options` part uploaded alongside the file.
 *
 * Always carries `formats` and the `integration: 'cli'` tag; every other
 * field is included only when the user actually supplied it, so the API's
 * own defaults apply otherwise.
 */
function buildOptionsPayload(options: ParseOptions): Record<string, unknown> {
  const { onlyMainContent, includeTags, excludeTags, timeout, location } =
    options;

  const payload: Record<string, unknown> = {
    formats: buildFormats(options),
    integration: 'cli',
  };

  if (onlyMainContent !== undefined) payload.onlyMainContent = onlyMainContent;
  if (includeTags?.length) payload.includeTags = includeTags;
  if (excludeTags?.length) payload.excludeTags = excludeTags;
  if (timeout !== undefined) payload.timeout = timeout;
  if (location) payload.location = location;

  return payload;
}

/**
 * Execute the parse command by POSTing a multipart upload to /v2/parse.
 *
 * Validates the file locally (exists, is a regular file, supported
 * extension), resolves auth/URL through the same config pipeline scrape
 * uses, then uploads the file plus a JSON `options` part. Never throws:
 * every failure is reported as `{ success: false, error }`.
 *
 * @param options - Parsed CLI options, including the local file path.
 * @returns The parsed document data, or a failure with a human-readable error.
 */
export async function executeParse(
  options: ParseOptions
): Promise<ParseResult> {
  const filePath = path.resolve(options.file);

  // One guarded statSync instead of existsSync + statSync: removes the
  // redundant filesystem call and the race window between the two checks
  // (existsSync-true followed by a statSync throw was previously uncaught).
  let stat: fs.Stats;
  try {
    stat = fs.statSync(filePath);
  } catch {
    return {
      success: false,
      error: `File not found: ${options.file}`,
    };
  }

  if (!stat.isFile()) {
    return {
      success: false,
      error: `Not a file: ${options.file}`,
    };
  }

  const ext = path.extname(filePath).toLowerCase();
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
    return {
      success: false,
      error:
        `Unsupported file type "${ext || '(none)'}". ` +
        `Supported extensions: ${[...SUPPORTED_EXTENSIONS].join(', ')}`,
    };
  }

  // Ensure auth/url is resolved through the same config pipeline scrape uses.
  if (options.apiKey || options.apiUrl) {
    getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl });
  }

  const config = getConfig();
  const apiKey = options.apiKey || config.apiKey;
  validateConfig(apiKey);

  // Strip a single trailing slash so `${apiUrl}/v2/parse` never doubles up.
  const apiUrl = (options.apiUrl || config.apiUrl || DEFAULT_API_URL).replace(
    /\/$/,
    ''
  );

  const buffer = fs.readFileSync(filePath);
  const filename = path.basename(filePath);
  const contentType = CONTENT_TYPE_BY_EXT[ext] ?? 'application/octet-stream';

  const form = new FormData();
  form.append(
    'file',
    new Blob([new Uint8Array(buffer)], { type: contentType }),
    filename
  );
  form.append('options', JSON.stringify(buildOptionsPayload(options)));

  const requestStartTime = Date.now();

  try {
    const response = await fetch(`${apiUrl}/v2/parse`, {
      method: 'POST',
      headers: apiKey ? { Authorization: `Bearer ${apiKey}` } : undefined,
      body: form,
    });

    const requestEndTime = Date.now();
    outputTiming(options, requestStartTime, requestEndTime);

    // The body may be empty or non-JSON on gateway errors; treat as {}.
    // The response shape is dynamic JSON, so a scoped any is deliberate.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const payload = (await response.json().catch(() => ({}))) as any;

    // The API can return HTTP 200 with success:false, so check both.
    if (!response.ok || payload?.success === false) {
      const message =
        payload?.error ||
        `HTTP ${response.status}: ${response.statusText || 'Request failed'}`;
      return { success: false, error: message };
    }

    return {
      success: true,
      data: payload?.data ?? payload,
    };
  } catch (error) {
    const requestEndTime = Date.now();
    outputTiming(options, requestStartTime, requestEndTime, error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error occurred',
    };
  }
}

/**
 * Handle parse command output. Reuses the scrape output formatter since the
 * /v2/parse response shape matches /v2/scrape.
 */
export async function handleParseCommand(options: ParseOptions): Promise<void> {
  const result = await executeParse(options);

  // A -Q answer is plain text; write it directly rather than running it
  // through the scrape formatter.
  const answer = result.success ? result.data?.answer : undefined;
  if (options.query && answer) {
    const { writeOutput } = await import('../utils/output');
    writeOutput(answer, options.output, !!options.output);
    return;
  }

  // Mirror buildFormats' default so the formatter sees what was requested.
  const requested = options.formats;
  const effectiveFormats: ScrapeFormat[] =
    requested && requested.length > 0 ? [...requested] : ['markdown'];

  handleScrapeOutput(
    result,
    effectiveFormats,
    options.output,
    options.pretty,
    options.json
  );
}
Loading
Loading