diff --git a/package.json b/package.json index ee8fa969f..89201d5af 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "0.0.2", + "version": "0.0.3", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { diff --git a/src/__tests__/commands/crawl.test.ts b/src/__tests__/commands/crawl.test.ts new file mode 100644 index 000000000..38a06cb15 --- /dev/null +++ b/src/__tests__/commands/crawl.test.ts @@ -0,0 +1,517 @@ +/** + * Tests for crawl command + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { executeCrawl } from '../../commands/crawl'; +import { getClient } from '../../utils/client'; +import { initializeConfig } from '../../utils/config'; +import { setupTest, teardownTest } from '../utils/mock-client'; + +// Mock the Firecrawl client module +vi.mock('../../utils/client', async () => { + const actual = await vi.importActual('../../utils/client'); + return { + ...actual, + getClient: vi.fn(), + }; +}); + +describe('executeCrawl', () => { + let mockClient: any; + + beforeEach(() => { + setupTest(); + // Initialize config with test API key + initializeConfig({ + apiKey: 'test-api-key', + apiUrl: 'https://api.firecrawl.dev', + }); + + // Create mock client + mockClient = { + startCrawl: vi.fn(), + getCrawlStatus: vi.fn(), + crawl: vi.fn(), + }; + + // Mock getClient to return our mock + vi.mocked(getClient).mockReturnValue(mockClient as any); + }); + + afterEach(() => { + teardownTest(); + vi.clearAllMocks(); + }); + + describe('Start crawl (async)', () => { + it('should call startCrawl with correct URL and return job ID', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + const result = await executeCrawl({ + urlOrJobId: 
'https://example.com', + }); + + expect(mockClient.startCrawl).toHaveBeenCalledTimes(1); + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + {} + ); + expect(result).toEqual({ + success: true, + data: { + jobId: mockResponse.id, + url: mockResponse.url, + status: 'processing', + }, + }); + }); + + it('should include limit option when provided', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + limit: 100, + }); + + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + limit: 100, + }) + ); + }); + + it('should include maxDepth option when provided', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + maxDepth: 3, + }); + + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + maxDiscoveryDepth: 3, + }) + ); + }); + + it('should include excludePaths option when provided', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + excludePaths: ['/admin', '/private'], + }); + + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + excludePaths: ['/admin', '/private'], + }) + ); + }); + + it('should include includePaths option when provided', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + await 
executeCrawl({ + urlOrJobId: 'https://example.com', + includePaths: ['/blog', '/docs'], + }); + + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + includePaths: ['/blog', '/docs'], + }) + ); + }); + + it('should include sitemap option when provided', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + sitemap: 'skip', + }); + + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + sitemap: 'skip', + }) + ); + }); + + it('should combine all options correctly', async () => { + const mockResponse = { + id: '550e8400-e29b-41d4-a716-446655440000', + url: 'https://example.com', + }; + mockClient.startCrawl.mockResolvedValue(mockResponse); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + limit: 50, + maxDepth: 2, + excludePaths: ['/admin'], + includePaths: ['/blog'], + sitemap: 'include', + ignoreQueryParameters: true, + crawlEntireDomain: false, + allowExternalLinks: false, + allowSubdomains: true, + delay: 1000, + maxConcurrency: 5, + }); + + expect(mockClient.startCrawl).toHaveBeenCalledWith( + 'https://example.com', + { + limit: 50, + maxDiscoveryDepth: 2, + excludePaths: ['/admin'], + includePaths: ['/blog'], + sitemap: 'include', + ignoreQueryParameters: true, + crawlEntireDomain: false, + allowExternalLinks: false, + allowSubdomains: true, + delay: 1000, + maxConcurrency: 5, + } + ); + }); + }); + + describe('Check crawl status', () => { + it('should check status when status flag is set', async () => { + const mockStatus = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'completed', + total: 100, + completed: 100, + creditsUsed: 50, + expiresAt: '2024-12-31T23:59:59Z', + }; + mockClient.getCrawlStatus.mockResolvedValue(mockStatus); + + const 
result = await executeCrawl({ + urlOrJobId: '550e8400-e29b-41d4-a716-446655440000', + status: true, + }); + + expect(mockClient.getCrawlStatus).toHaveBeenCalledTimes(1); + expect(mockClient.getCrawlStatus).toHaveBeenCalledWith( + '550e8400-e29b-41d4-a716-446655440000' + ); + expect(result).toEqual({ + success: true, + data: { + id: mockStatus.id, + status: mockStatus.status, + total: mockStatus.total, + completed: mockStatus.completed, + creditsUsed: mockStatus.creditsUsed, + expiresAt: mockStatus.expiresAt, + }, + }); + }); + + it('should auto-detect job ID from UUID format', async () => { + const mockStatus = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'scraping', + total: 100, + completed: 45, + }; + mockClient.getCrawlStatus.mockResolvedValue(mockStatus); + + const result = await executeCrawl({ + urlOrJobId: '550e8400-e29b-41d4-a716-446655440000', + }); + + expect(mockClient.getCrawlStatus).toHaveBeenCalledTimes(1); + expect(result.success).toBe(true); + }); + + it('should handle status check with missing optional fields', async () => { + const mockStatus = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'scraping', + total: 100, + completed: 45, + }; + mockClient.getCrawlStatus.mockResolvedValue(mockStatus); + + const result = await executeCrawl({ + urlOrJobId: '550e8400-e29b-41d4-a716-446655440000', + status: true, + }); + + expect(result.success).toBe(true); + if (result.success && 'data' in result) { + expect(result.data?.creditsUsed).toBeUndefined(); + expect(result.data?.expiresAt).toBeUndefined(); + } + }); + }); + + describe('Wait mode (synchronous crawl)', () => { + it('should use crawl method with wait when wait flag is set', async () => { + const mockCrawlJob = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'completed', + total: 100, + completed: 100, + data: [{ markdown: '# Page 1' }], + }; + mockClient.crawl.mockResolvedValue(mockCrawlJob); + + const result = await executeCrawl({ + urlOrJobId: 
'https://example.com', + wait: true, + }); + + expect(mockClient.crawl).toHaveBeenCalledTimes(1); + expect(mockClient.crawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + pollInterval: 5000, // Default poll interval + }) + ); + expect(result).toEqual({ + success: true, + data: mockCrawlJob, + }); + }); + + it('should include custom pollInterval when provided', async () => { + const mockCrawlJob = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'completed', + total: 100, + completed: 100, + data: [], + }; + mockClient.crawl.mockResolvedValue(mockCrawlJob); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + wait: true, + pollInterval: 10, + }); + + expect(mockClient.crawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + pollInterval: 10000, // Converted to milliseconds + }) + ); + }); + + it('should include timeout when provided', async () => { + const mockCrawlJob = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'completed', + total: 100, + completed: 100, + data: [], + }; + mockClient.crawl.mockResolvedValue(mockCrawlJob); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + wait: true, + timeout: 300, + }); + + expect(mockClient.crawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + timeout: 300000, // Converted to milliseconds + }) + ); + }); + + it('should combine wait options with crawl options', async () => { + const mockCrawlJob = { + id: '550e8400-e29b-41d4-a716-446655440000', + status: 'completed', + total: 50, + completed: 50, + data: [], + }; + mockClient.crawl.mockResolvedValue(mockCrawlJob); + + await executeCrawl({ + urlOrJobId: 'https://example.com', + wait: true, + pollInterval: 5, + timeout: 600, + limit: 50, + maxDepth: 2, + }); + + expect(mockClient.crawl).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + pollInterval: 5000, + timeout: 600000, + limit: 50, + maxDiscoveryDepth: 2, + }) + ); 
+ }); + }); + + describe('Progress mode', () => { + beforeEach(() => { + // Mock process.stderr.write to avoid console output during tests + vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + // Use fake timers to avoid actual waiting + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + it('should use custom polling with progress when progress flag is set', async () => { + const jobId = '550e8400-e29b-41d4-a716-446655440000'; + const mockStartResponse = { + id: jobId, + url: 'https://example.com', + }; + const mockScrapingStatus = { + id: jobId, + status: 'scraping', + total: 100, + completed: 50, + data: [], + }; + const mockCompletedStatus = { + id: jobId, + status: 'completed', + total: 100, + completed: 100, + data: [], + }; + + mockClient.startCrawl.mockResolvedValue(mockStartResponse); + // First call returns scraping status, second returns completed + mockClient.getCrawlStatus + .mockResolvedValueOnce(mockScrapingStatus) + .mockResolvedValueOnce(mockCompletedStatus); + + // Start the async operation + const crawlPromise = executeCrawl({ + urlOrJobId: 'https://example.com', + wait: true, + progress: true, + pollInterval: 0.001, // Very short interval for testing (1ms) + }); + + // Fast-forward timers to resolve the first setTimeout + await vi.advanceTimersByTimeAsync(1); + + // Fast-forward again to resolve the second setTimeout + await vi.advanceTimersByTimeAsync(1); + + const result = await crawlPromise; + + expect(mockClient.startCrawl).toHaveBeenCalledTimes(1); + expect(mockClient.getCrawlStatus).toHaveBeenCalledTimes(2); + expect(result.success).toBe(true); + if (result.success && 'data' in result) { + expect(result.data.status).toBe('completed'); + } + }); + }); + + describe('Error handling', () => { + it('should return error result when startCrawl fails', async () => { + const errorMessage = 'API Error: Invalid URL'; + mockClient.startCrawl.mockRejectedValue(new Error(errorMessage)); 
+ + const result = await executeCrawl({ + urlOrJobId: 'https://example.com', + }); + + expect(result).toEqual({ + success: false, + error: errorMessage, + }); + }); + + it('should return error result when getCrawlStatus fails', async () => { + const errorMessage = 'Job not found'; + mockClient.getCrawlStatus.mockRejectedValue(new Error(errorMessage)); + + const result = await executeCrawl({ + urlOrJobId: '550e8400-e29b-41d4-a716-446655440000', + status: true, + }); + + expect(result).toEqual({ + success: false, + error: errorMessage, + }); + }); + + it('should return error result when crawl fails', async () => { + const errorMessage = 'Crawl timeout'; + mockClient.crawl.mockRejectedValue(new Error(errorMessage)); + + const result = await executeCrawl({ + urlOrJobId: 'https://example.com', + wait: true, + }); + + expect(result).toEqual({ + success: false, + error: errorMessage, + }); + }); + + it('should handle non-Error exceptions', async () => { + mockClient.startCrawl.mockRejectedValue('String error'); + + const result = await executeCrawl({ + urlOrJobId: 'https://example.com', + }); + + expect(result.success).toBe(false); + expect(result.error).toBe('Unknown error occurred'); + }); + }); +}); diff --git a/src/__tests__/commands/map.test.ts b/src/__tests__/commands/map.test.ts new file mode 100644 index 000000000..ea7487e18 --- /dev/null +++ b/src/__tests__/commands/map.test.ts @@ -0,0 +1,351 @@ +/** + * Tests for map command + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { executeMap } from '../../commands/map'; +import { getClient } from '../../utils/client'; +import { initializeConfig } from '../../utils/config'; +import { setupTest, teardownTest } from '../utils/mock-client'; + +// Mock the Firecrawl client module +vi.mock('../../utils/client', async () => { + const actual = await vi.importActual('../../utils/client'); + return { + ...actual, + getClient: vi.fn(), + }; +}); + +describe('executeMap', () => { + let 
mockClient: any; + + beforeEach(() => { + setupTest(); + // Initialize config with test API key + initializeConfig({ + apiKey: 'test-api-key', + apiUrl: 'https://api.firecrawl.dev', + }); + + // Create mock client + mockClient = { + map: vi.fn(), + }; + + // Mock getClient to return our mock + vi.mocked(getClient).mockReturnValue(mockClient as any); + }); + + afterEach(() => { + teardownTest(); + vi.clearAllMocks(); + }); + + describe('API call generation', () => { + it('should call map with correct URL and default options', async () => { + const mockResponse = { + links: [ + { url: 'https://example.com/page1', title: 'Page 1' }, + { url: 'https://example.com/page2', title: 'Page 2' }, + ], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(mockClient.map).toHaveBeenCalledTimes(1); + expect(mockClient.map).toHaveBeenCalledWith('https://example.com', {}); + }); + + it('should include limit option when provided', async () => { + const mockResponse = { + links: [{ url: 'https://example.com/page1' }], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + limit: 50, + }); + + expect(mockClient.map).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + limit: 50, + }) + ); + }); + + it('should include search option when provided', async () => { + const mockResponse = { + links: [{ url: 'https://example.com/blog' }], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + search: 'blog', + }); + + expect(mockClient.map).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + search: 'blog', + }) + ); + }); + + it('should include sitemap option when provided', async () => { + const mockResponse = { + links: [{ url: 'https://example.com/page1' }], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + 
urlOrJobId: 'https://example.com', + sitemap: 'only', + }); + + expect(mockClient.map).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + sitemap: 'only', + }) + ); + }); + + it('should include includeSubdomains option when provided', async () => { + const mockResponse = { + links: [{ url: 'https://sub.example.com/page1' }], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + includeSubdomains: true, + }); + + expect(mockClient.map).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + includeSubdomains: true, + }) + ); + }); + + it('should include ignoreQueryParameters option when provided', async () => { + const mockResponse = { + links: [{ url: 'https://example.com/page1' }], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + ignoreQueryParameters: true, + }); + + expect(mockClient.map).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + ignoreQueryParameters: true, + }) + ); + }); + + it('should include timeout option when provided', async () => { + const mockResponse = { + links: [{ url: 'https://example.com/page1' }], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + timeout: 60, + }); + + expect(mockClient.map).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ + timeout: 60000, // Converted to milliseconds + }) + ); + }); + + it('should combine all options correctly', async () => { + const mockResponse = { + links: [ + { url: 'https://example.com/blog/post1' }, + { url: 'https://example.com/blog/post2' }, + ], + }; + mockClient.map.mockResolvedValue(mockResponse); + + await executeMap({ + urlOrJobId: 'https://example.com', + limit: 100, + search: 'blog', + sitemap: 'include', + includeSubdomains: true, + ignoreQueryParameters: true, + timeout: 120, + }); + + 
expect(mockClient.map).toHaveBeenCalledWith('https://example.com', { + limit: 100, + search: 'blog', + sitemap: 'include', + includeSubdomains: true, + ignoreQueryParameters: true, + timeout: 120000, + }); + }); + }); + + describe('Response handling', () => { + it('should return success result with mapped links', async () => { + const mockResponse = { + links: [ + { + url: 'https://example.com/page1', + title: 'Page 1', + description: 'Description 1', + }, + { + url: 'https://example.com/page2', + title: 'Page 2', + description: 'Description 2', + }, + ], + }; + mockClient.map.mockResolvedValue(mockResponse); + + const result = await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(result).toEqual({ + success: true, + data: { + links: [ + { + url: 'https://example.com/page1', + title: 'Page 1', + description: 'Description 1', + }, + { + url: 'https://example.com/page2', + title: 'Page 2', + description: 'Description 2', + }, + ], + }, + }); + }); + + it('should handle links without title or description', async () => { + const mockResponse = { + links: [ + { url: 'https://example.com/page1' }, + { + url: 'https://example.com/page2', + title: 'Page 2', + }, + ], + }; + mockClient.map.mockResolvedValue(mockResponse); + + const result = await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(result.success).toBe(true); + if (result.success && result.data) { + expect(result.data.links).toHaveLength(2); + expect(result.data.links[0]).toEqual({ + url: 'https://example.com/page1', + title: undefined, + description: undefined, + }); + expect(result.data.links[1]).toEqual({ + url: 'https://example.com/page2', + title: 'Page 2', + description: undefined, + }); + } + }); + + it('should handle empty links array', async () => { + const mockResponse = { + links: [], + }; + mockClient.map.mockResolvedValue(mockResponse); + + const result = await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(result.success).toBe(true); + if 
(result.success && result.data) { + expect(result.data.links).toEqual([]); + } + }); + + it('should return error result when map fails', async () => { + const errorMessage = 'API Error: Invalid URL'; + mockClient.map.mockRejectedValue(new Error(errorMessage)); + + const result = await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(result).toEqual({ + success: false, + error: errorMessage, + }); + }); + + it('should handle non-Error exceptions', async () => { + mockClient.map.mockRejectedValue('String error'); + + const result = await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(result.success).toBe(false); + expect(result.error).toBe('Unknown error occurred'); + }); + }); + + describe('Data transformation', () => { + it('should transform links to expected format', async () => { + const mockResponse = { + links: [ + { + url: 'https://example.com/page1', + title: 'Page 1', + description: 'Description 1', + otherField: 'should be ignored', + }, + ], + }; + mockClient.map.mockResolvedValue(mockResponse); + + const result = await executeMap({ + urlOrJobId: 'https://example.com', + }); + + expect(result.success).toBe(true); + if (result.success && result.data) { + expect(result.data.links[0]).toEqual({ + url: 'https://example.com/page1', + title: 'Page 1', + description: 'Description 1', + }); + expect(result.data.links[0]).not.toHaveProperty('otherField'); + } + }); + }); +}); diff --git a/src/__tests__/utils/job.test.ts b/src/__tests__/utils/job.test.ts new file mode 100644 index 000000000..0fee97ec1 --- /dev/null +++ b/src/__tests__/utils/job.test.ts @@ -0,0 +1,59 @@ +/** + * Tests for job utility functions + */ + +import { describe, it, expect } from 'vitest'; +import { isJobId, isValidUrl } from '../../utils/job'; + +describe('isJobId', () => { + it('should return true for valid UUID v4 format', () => { + expect(isJobId('550e8400-e29b-41d4-a716-446655440000')).toBe(true); + 
expect(isJobId('123e4567-e89b-42d3-a456-426614174000')).toBe(true); // Fixed: version digit must be 4 + expect(isJobId('00000000-0000-4000-8000-000000000000')).toBe(true); + expect(isJobId('ffffffff-ffff-4fff-8fff-ffffffffffff')).toBe(true); + }); + + it('should return false for invalid UUID formats', () => { + expect(isJobId('not-a-uuid')).toBe(false); + expect(isJobId('550e8400-e29b-41d4-a716')).toBe(false); + expect(isJobId('550e8400-e29b-41d4-a716-446655440000-extra')).toBe(false); + expect(isJobId('')).toBe(false); + }); + + it('should return false for URLs', () => { + expect(isJobId('https://example.com')).toBe(false); + expect(isJobId('http://example.com/page')).toBe(false); + }); + + it('should be case-insensitive', () => { + expect(isJobId('550E8400-E29B-41D4-A716-446655440000')).toBe(true); + expect(isJobId('550e8400-E29b-41d4-A716-446655440000')).toBe(true); + }); + + it('should return false for UUID v1 format', () => { + // UUID v1 has different version number (1 instead of 4) + expect(isJobId('550e8400-e29b-11d4-a716-446655440000')).toBe(false); + }); +}); + +describe('isValidUrl', () => { + it('should return true for valid HTTP URLs', () => { + expect(isValidUrl('http://example.com')).toBe(true); + expect(isValidUrl('https://example.com')).toBe(true); + expect(isValidUrl('https://example.com/path')).toBe(true); + expect(isValidUrl('https://example.com/path?query=value')).toBe(true); + expect(isValidUrl('https://example.com:8080/path')).toBe(true); + }); + + it('should return false for invalid URLs', () => { + expect(isValidUrl('not-a-url')).toBe(false); + expect(isValidUrl('example.com')).toBe(false); + expect(isValidUrl('')).toBe(false); + expect(isValidUrl('ftp://example.com')).toBe(true); // Still valid URL + }); + + it('should handle edge cases', () => { + expect(isValidUrl('http://')).toBe(false); + expect(isValidUrl('https://')).toBe(false); + }); +}); diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts new file mode 100644 index 
000000000..c75d6c1f8 --- /dev/null +++ b/src/commands/crawl.ts @@ -0,0 +1,280 @@ +/** + * Crawl command implementation + */ + +import type { + CrawlOptions, + CrawlResult, + CrawlStatusResult, +} from '../types/crawl'; +import { getClient } from '../utils/client'; +import { updateConfig } from '../utils/config'; +import { isJobId } from '../utils/job'; +import { writeOutput } from '../utils/output'; + +/** + * Execute crawl status check + */ +async function checkCrawlStatus( + jobId: string, + options: CrawlOptions +): Promise { + try { + const app = getClient(); + const status = await app.getCrawlStatus(jobId); + + return { + success: true, + data: { + id: status.id, + status: status.status, + total: status.total, + completed: status.completed, + creditsUsed: status.creditsUsed, + expiresAt: status.expiresAt, + }, + }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error.message : 'Unknown error occurred', + }; + } +} + +/** + * Execute crawl command + */ +export async function executeCrawl( + options: CrawlOptions +): Promise { + try { + // Update global config if API key is provided + if (options.apiKey) { + updateConfig({ apiKey: options.apiKey }); + } + + const app = getClient(); + const { urlOrJobId, status, wait, pollInterval, timeout } = options; + + // If status flag is set or input looks like a job ID, check status + if (status || isJobId(urlOrJobId)) { + return await checkCrawlStatus(urlOrJobId, options); + } + + // Build crawl options + const crawlOptions: any = {}; + + if (options.limit !== undefined) { + crawlOptions.limit = options.limit; + } + if (options.maxDepth !== undefined) { + crawlOptions.maxDiscoveryDepth = options.maxDepth; + } + if (options.excludePaths && options.excludePaths.length > 0) { + crawlOptions.excludePaths = options.excludePaths; + } + if (options.includePaths && options.includePaths.length > 0) { + crawlOptions.includePaths = options.includePaths; + } + if (options.sitemap) { + 
crawlOptions.sitemap = options.sitemap; + } + if (options.ignoreQueryParameters !== undefined) { + crawlOptions.ignoreQueryParameters = options.ignoreQueryParameters; + } + if (options.crawlEntireDomain !== undefined) { + crawlOptions.crawlEntireDomain = options.crawlEntireDomain; + } + if (options.allowExternalLinks !== undefined) { + crawlOptions.allowExternalLinks = options.allowExternalLinks; + } + if (options.allowSubdomains !== undefined) { + crawlOptions.allowSubdomains = options.allowSubdomains; + } + if (options.delay !== undefined) { + crawlOptions.delay = options.delay; + } + if (options.maxConcurrency !== undefined) { + crawlOptions.maxConcurrency = options.maxConcurrency; + } + + // If wait mode, use the convenience crawl method with polling + if (wait) { + // Set polling options + if (pollInterval !== undefined) { + crawlOptions.pollInterval = pollInterval * 1000; // Convert to milliseconds + } else { + // Default poll interval: 5 seconds + crawlOptions.pollInterval = 5000; + } + if (timeout !== undefined) { + crawlOptions.timeout = timeout * 1000; // Convert to milliseconds + } + + // Show progress if requested - use custom polling for better UX + if (options.progress) { + // Start crawl first + const response = await app.startCrawl(urlOrJobId, crawlOptions); + const jobId = response.id; + + process.stderr.write(`Crawling ${urlOrJobId}...\n`); + process.stderr.write(`Job ID: ${jobId}\n`); + + // Poll for status with progress updates + const pollMs = crawlOptions.pollInterval || 5000; + const startTime = Date.now(); + const timeoutMs = timeout ? 
timeout * 1000 : undefined; + + while (true) { + await new Promise((resolve) => setTimeout(resolve, pollMs)); + + const status = await app.getCrawlStatus(jobId); + + // Show progress + process.stderr.write( + `\rProgress: ${status.completed}/${status.total} pages (${status.status})` + ); + + if ( + status.status === 'completed' || + status.status === 'failed' || + status.status === 'cancelled' + ) { + process.stderr.write('\n'); + return { + success: true, + data: status, + }; + } + + // Check timeout + if (timeoutMs && Date.now() - startTime > timeoutMs) { + process.stderr.write('\n'); + return { + success: false, + error: `Timeout after ${timeout} seconds. Crawl still in progress.`, + }; + } + } + } else { + // Use SDK's built-in polling (no progress display) + const crawlJob = await app.crawl(urlOrJobId, crawlOptions); + return { + success: true, + data: crawlJob, + }; + } + } + + // Otherwise, start crawl and return job ID + const response = await app.startCrawl(urlOrJobId, crawlOptions); + + return { + success: true, + data: { + jobId: response.id, + url: response.url, + status: 'processing', + }, + }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? 
error.message : 'Unknown error occurred', + }; + } +} + +/** + * Format crawl status in human-readable way + */ +function formatCrawlStatus(data: CrawlStatusResult['data']): string { + if (!data) return ''; + + const lines: string[] = []; + lines.push(`Job ID: ${data.id}`); + lines.push(`Status: ${data.status}`); + lines.push(`Progress: ${data.completed}/${data.total} pages`); + + if (data.creditsUsed !== undefined) { + lines.push(`Credits Used: ${data.creditsUsed}`); + } + + if (data.expiresAt) { + const expiresDate = new Date(data.expiresAt); + lines.push( + `Expires: ${expiresDate.toLocaleString('en-US', { + year: 'numeric', + month: 'short', + day: 'numeric', + hour: '2-digit', + minute: '2-digit', + })}` + ); + } + + return lines.join('\n') + '\n'; +} + +/** + * Handle crawl command output + */ +export async function handleCrawlCommand(options: CrawlOptions): Promise { + const result = await executeCrawl(options); + + if (!result.success) { + console.error('Error:', result.error); + process.exit(1); + } + + // Handle status check result + if ('status' in result && result.data && 'status' in result.data) { + const statusResult = result as CrawlStatusResult; + if (statusResult.data) { + let outputContent: string; + + if (options.pretty || !options.output) { + // Human-readable format for status + outputContent = formatCrawlStatus(statusResult.data); + } else { + // JSON format + outputContent = options.pretty + ? 
JSON.stringify({ success: true, data: statusResult.data }, null, 2) + : JSON.stringify({ success: true, data: statusResult.data }); + } + + writeOutput(outputContent, options.output, !!options.output); + return; + } + } + + // Handle crawl result (job ID or completed crawl) + const crawlResult = result as CrawlResult; + if (!crawlResult.data) { + return; + } + + let outputContent: string; + + // If it's a job ID response (has jobId field) + if ('jobId' in crawlResult.data) { + const jobData = { + jobId: crawlResult.data.jobId, + url: crawlResult.data.url, + status: crawlResult.data.status, + }; + + outputContent = options.pretty + ? JSON.stringify({ success: true, data: jobData }, null, 2) + : JSON.stringify({ success: true, data: jobData }); + } else { + // Completed crawl - output the data + // For completed crawls, output as JSON + outputContent = options.pretty + ? JSON.stringify(crawlResult.data, null, 2) + : JSON.stringify(crawlResult.data); + } + + writeOutput(outputContent, options.output, !!options.output); +} diff --git a/src/commands/map.ts b/src/commands/map.ts new file mode 100644 index 000000000..e1d80704c --- /dev/null +++ b/src/commands/map.ts @@ -0,0 +1,104 @@ +/** + * Map command implementation + */ + +import type { MapOptions, MapResult } from '../types/map'; +import { getClient } from '../utils/client'; +import { updateConfig } from '../utils/config'; +import { writeOutput } from '../utils/output'; + +/** + * Execute map command + */ +export async function executeMap(options: MapOptions): Promise { + try { + // Update global config if API key is provided + if (options.apiKey) { + updateConfig({ apiKey: options.apiKey }); + } + + const app = getClient(); + const { urlOrJobId } = options; + + // Build map options + const mapOptions: any = {}; + + if (options.limit !== undefined) { + mapOptions.limit = options.limit; + } + if (options.search) { + mapOptions.search = options.search; + } + if (options.sitemap) { + mapOptions.sitemap = 
options.sitemap; + } + if (options.includeSubdomains !== undefined) { + mapOptions.includeSubdomains = options.includeSubdomains; + } + if (options.ignoreQueryParameters !== undefined) { + mapOptions.ignoreQueryParameters = options.ignoreQueryParameters; + } + if (options.timeout !== undefined) { + mapOptions.timeout = options.timeout * 1000; // Convert to milliseconds + } + + // Execute map (seems synchronous in SDK) + const mapData = await app.map(urlOrJobId, mapOptions); + + return { + success: true, + data: { + links: mapData.links.map((link: any) => ({ + url: link.url, + title: link.title, + description: link.description, + })), + }, + }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error.message : 'Unknown error occurred', + }; + } +} + +/** + * Format map data in human-readable way + */ +function formatMapReadable(data: MapResult['data']): string { + if (!data || !data.links) return ''; + + // Output one URL per line (like curl) + return data.links.map((link) => link.url).join('\n') + '\n'; +} + +/** + * Handle map command output + */ +export async function handleMapCommand(options: MapOptions): Promise { + const result = await executeMap(options); + + if (!result.success) { + console.error('Error:', result.error); + process.exit(1); + } + + if (!result.data) { + return; + } + + let outputContent: string; + + // Use JSON format if --json flag is set + if (options.json) { + outputContent = options.pretty + ? 
JSON.stringify({ success: true, data: result.data }, null, 2) + : JSON.stringify({ success: true, data: result.data }); + } else { + // Default to human-readable format (one URL per line) + outputContent = formatMapReadable(result.data); + } + + writeOutput(outputContent, options.output, !!options.output); +} diff --git a/src/index.ts b/src/index.ts index 6c8407574..042692768 100644 --- a/src/index.ts +++ b/src/index.ts @@ -10,8 +10,11 @@ import { handleScrapeCommand } from './commands/scrape'; import { initializeConfig, updateConfig } from './utils/config'; import { configure } from './commands/config'; import { handleCreditUsageCommand } from './commands/credit-usage'; +import { handleCrawlCommand } from './commands/crawl'; +import { handleMapCommand } from './commands/map'; import { isUrl, normalizeUrl } from './utils/url'; import { parseScrapeOptions } from './utils/options'; +import { isJobId } from './utils/job'; // Initialize global configuration from environment variables initializeConfig(); @@ -90,6 +93,175 @@ function createScrapeCommand(): Command { // Add scrape command to main program program.addCommand(createScrapeCommand()); +/** + * Create and configure the crawl command + */ +function createCrawlCommand(): Command { + const crawlCmd = new Command('crawl') + .description('Crawl a website using Firecrawl') + .argument('[url-or-job-id]', 'URL to crawl or job ID to check status') + .option( + '-u, --url ', + 'URL to crawl (alternative to positional argument)' + ) + .option('--status', 'Check status of existing crawl job', false) + .option( + '--wait', + 'Wait for crawl to complete before returning results', + false + ) + .option( + '--poll-interval ', + 'Polling interval in seconds when waiting (default: 5)', + parseFloat + ) + .option( + '--timeout ', + 'Timeout in seconds when waiting (default: no timeout)', + parseFloat + ) + .option('--progress', 'Show progress dots while waiting', false) + .option('--limit ', 'Maximum number of pages to crawl', 
parseInt) + .option('--max-depth ', 'Maximum crawl depth', parseInt) + .option( + '--exclude-paths ', + 'Comma-separated list of paths to exclude' + ) + .option( + '--include-paths ', + 'Comma-separated list of paths to include' + ) + .option('--sitemap ', 'Sitemap handling: skip, include', 'include') + .option( + '--ignore-query-parameters', + 'Ignore query parameters when crawling', + false + ) + .option('--crawl-entire-domain', 'Crawl entire domain', false) + .option('--allow-external-links', 'Allow external links', false) + .option('--allow-subdomains', 'Allow subdomains', false) + .option('--delay ', 'Delay between requests in milliseconds', parseInt) + .option( + '--max-concurrency ', + 'Maximum concurrent requests', + parseInt + ) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--pretty', 'Pretty print JSON output', false) + .action(async (positionalUrlOrJobId, options) => { + // Use positional argument if provided, otherwise use --url option + const urlOrJobId = positionalUrlOrJobId || options.url; + if (!urlOrJobId) { + console.error( + 'Error: URL or job ID is required. Provide it as argument or use --url option.' + ); + process.exit(1); + } + + // Auto-detect if it's a job ID (UUID format) + const isStatusCheck = options.status || isJobId(urlOrJobId); + + const crawlOptions = { + urlOrJobId, + status: isStatusCheck, + wait: options.wait, + pollInterval: options.pollInterval, + timeout: options.timeout, + progress: options.progress, + output: options.output, + pretty: options.pretty, + apiKey: options.apiKey, + limit: options.limit, + maxDepth: options.maxDepth, + excludePaths: options.excludePaths + ? options.excludePaths.split(',').map((p: string) => p.trim()) + : undefined, + includePaths: options.includePaths + ? 
options.includePaths.split(',').map((p: string) => p.trim()) + : undefined, + sitemap: options.sitemap, + ignoreQueryParameters: options.ignoreQueryParameters, + crawlEntireDomain: options.crawlEntireDomain, + allowExternalLinks: options.allowExternalLinks, + allowSubdomains: options.allowSubdomains, + delay: options.delay, + maxConcurrency: options.maxConcurrency, + }; + + await handleCrawlCommand(crawlOptions); + }); + + return crawlCmd; +} + +/** + * Create and configure the map command + */ +function createMapCommand(): Command { + const mapCmd = new Command('map') + .description('Map URLs on a website using Firecrawl') + .argument('[url]', 'URL to map') + .option( + '-u, --url ', + 'URL to map (alternative to positional argument)' + ) + .option('--wait', 'Wait for map to complete', false) + .option('--limit ', 'Maximum URLs to discover', parseInt) + .option('--search ', 'Search query to filter URLs') + .option( + '--sitemap ', + 'Sitemap handling: only, include, skip', + 'include' + ) + .option('--include-subdomains', 'Include subdomains', false) + .option('--ignore-query-parameters', 'Ignore query parameters', false) + .option('--timeout ', 'Timeout in seconds', parseFloat) + .option( + '-k, --api-key ', + 'Firecrawl API key (overrides global --api-key)' + ) + .option('-o, --output ', 'Output file path (default: stdout)') + .option('--json', 'Output as JSON format', false) + .option('--pretty', 'Pretty print JSON output', false) + .action(async (positionalUrl, options) => { + // Use positional URL if provided, otherwise use --url option + const url = positionalUrl || options.url; + if (!url) { + console.error( + 'Error: URL is required. Provide it as argument or use --url option.' 
+ ); + process.exit(1); + } + + const mapOptions = { + urlOrJobId: url, + wait: options.wait, + output: options.output, + json: options.json, + pretty: options.pretty, + apiKey: options.apiKey, + limit: options.limit, + search: options.search, + sitemap: options.sitemap, + includeSubdomains: options.includeSubdomains, + ignoreQueryParameters: options.ignoreQueryParameters, + timeout: options.timeout, + }; + + await handleMapCommand(mapOptions); + }); + + return mapCmd; +} + +// Add crawl and map commands to main program +program.addCommand(createCrawlCommand()); +program.addCommand(createMapCommand()); + program .command('config') .description('Configure API URL and API key (interactive)') diff --git a/src/types/crawl.ts b/src/types/crawl.ts new file mode 100644 index 000000000..4efca802d --- /dev/null +++ b/src/types/crawl.ts @@ -0,0 +1,65 @@ +/** + * Types for crawl command + */ + +export interface CrawlOptions { + /** API key for Firecrawl */ + apiKey?: string; + /** URL to crawl or job ID to check status */ + urlOrJobId: string; + /** Check status of existing crawl job */ + status?: boolean; + /** Wait for crawl to complete */ + wait?: boolean; + /** Polling interval in seconds when waiting */ + pollInterval?: number; + /** Timeout in seconds when waiting */ + timeout?: number; + /** Show progress dots while waiting */ + progress?: boolean; + /** Output file path */ + output?: string; + /** Pretty print JSON output */ + pretty?: boolean; + /** Maximum number of pages to crawl */ + limit?: number; + /** Maximum crawl depth */ + maxDepth?: number; + /** Exclude paths */ + excludePaths?: string[]; + /** Include paths */ + includePaths?: string[]; + /** Sitemap handling */ + sitemap?: 'skip' | 'include'; + /** Ignore query parameters */ + ignoreQueryParameters?: boolean; + /** Crawl entire domain */ + crawlEntireDomain?: boolean; + /** Allow external links */ + allowExternalLinks?: boolean; + /** Allow subdomains */ + allowSubdomains?: boolean; + /** Delay between 
/**
 * Types for crawl command
 */

export interface CrawlOptions {
  /** API key for Firecrawl (overrides global/env configuration) */
  apiKey?: string;
  /** URL to crawl or job ID to check status */
  urlOrJobId: string;
  /** Check status of existing crawl job */
  status?: boolean;
  /** Wait for crawl to complete */
  wait?: boolean;
  /** Polling interval in seconds when waiting */
  pollInterval?: number;
  /** Timeout in seconds when waiting */
  timeout?: number;
  /** Show progress dots while waiting */
  progress?: boolean;
  /** Output file path (stdout when omitted) */
  output?: string;
  /** Pretty print JSON output */
  pretty?: boolean;
  /** Maximum number of pages to crawl */
  limit?: number;
  /** Maximum crawl depth */
  maxDepth?: number;
  /** Exclude paths (already split/trimmed by the CLI layer) */
  excludePaths?: string[];
  /** Include paths (already split/trimmed by the CLI layer) */
  includePaths?: string[];
  /** Sitemap handling */
  sitemap?: 'skip' | 'include';
  /** Ignore query parameters */
  ignoreQueryParameters?: boolean;
  /** Crawl entire domain */
  crawlEntireDomain?: boolean;
  /** Allow external links */
  allowExternalLinks?: boolean;
  /** Allow subdomains */
  allowSubdomains?: boolean;
  /** Delay between requests */
  delay?: number;
  /** Maximum concurrency */
  maxConcurrency?: number;
}

// Result envelope for a crawl start or completed crawl.
// NOTE(review): `data` is `any` because it carries either a job-ID envelope
// ({jobId, url, status}) or raw completed-crawl data; consider a
// discriminated union once the SDK response shapes are pinned down.
export interface CrawlResult {
  success: boolean;
  data?: any;
  error?: string;
}

// Result envelope for a crawl status check (`--status` / job-ID argument).
export interface CrawlStatusResult {
  success: boolean;
  data?: {
    id: string;
    status: 'scraping' | 'completed' | 'failed' | 'cancelled';
    total: number;
    completed: number;
    creditsUsed?: number;
    expiresAt?: string;
  };
  error?: string;
}

/**
 * Types for map command
 */

export interface MapOptions {
  /** API key for Firecrawl (overrides global/env configuration) */
  apiKey?: string;
  /** URL to map or job ID to check status */
  urlOrJobId: string;
  /** Check status of existing map job */
  status?: boolean;
  /** Wait for map to complete */
  wait?: boolean;
  /** Output file path (stdout when omitted) */
  output?: string;
  /** Output as JSON format (default is one URL per line) */
  json?: boolean;
  /** Pretty print JSON output */
  pretty?: boolean;
  /** Maximum URLs to discover */
  limit?: number;
  /** Search query */
  search?: string;
  /** Sitemap handling */
  sitemap?: 'only' | 'include' | 'skip';
  /** Include subdomains */
  includeSubdomains?: boolean;
  /** Ignore query parameters */
  ignoreQueryParameters?: boolean;
  /** Timeout in seconds (converted to milliseconds for the SDK) */
  timeout?: number;
}

// Result envelope for the map command: discovered links with optional metadata.
export interface MapResult {
  success: boolean;
  data?: {
    links: Array<{
      url: string;
      title?: string;
      description?: string;
    }>;
  };
  error?: string;
}
/^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; + return uuidPattern.test(str); +} + +/** + * Check if a string is a valid URL + */ +export function isValidUrl(str: string): boolean { + try { + new URL(str); + return true; + } catch { + return false; + } +}