diff --git a/composer.json b/composer.json index 1d447c3..3421acb 100644 --- a/composer.json +++ b/composer.json @@ -34,6 +34,7 @@ "shipfastlabs/toolkit-calculator": "self.version", "shipfastlabs/toolkit-database": "self.version", "shipfastlabs/toolkit-exa": "self.version", + "shipfastlabs/toolkit-firecrawl": "self.version", "shipfastlabs/toolkit-jigsawstack": "self.version", "shipfastlabs/toolkit-perplexity": "self.version", "shipfastlabs/toolkit-stub": "self.version", @@ -47,6 +48,7 @@ "Shipfastlabs\\Toolkit\\Calculator\\": "src/Calculator/src/", "Shipfastlabs\\Toolkit\\Database\\": "src/Database/src/", "Shipfastlabs\\Toolkit\\Exa\\": "src/Exa/src/", + "Shipfastlabs\\Toolkit\\Firecrawl\\": "src/Firecrawl/src/", "Shipfastlabs\\Toolkit\\JigsawStack\\": "src/JigsawStack/src/", "Shipfastlabs\\Toolkit\\Perplexity\\": "src/Perplexity/src/", "Shipfastlabs\\Toolkit\\Tavily\\": "src/Tavily/src/" @@ -57,6 +59,7 @@ "Shipfastlabs\\Toolkit\\Calculator\\Tests\\": "src/Calculator/tests/", "Shipfastlabs\\Toolkit\\Database\\Tests\\": "src/Database/tests/", "Shipfastlabs\\Toolkit\\Exa\\Tests\\": "src/Exa/tests/", + "Shipfastlabs\\Toolkit\\Firecrawl\\Tests\\": "src/Firecrawl/tests/", "Shipfastlabs\\Toolkit\\JigsawStack\\Tests\\": "src/JigsawStack/tests/", "Shipfastlabs\\Toolkit\\Perplexity\\Tests\\": "src/Perplexity/tests/", "Shipfastlabs\\Toolkit\\Tavily\\Tests\\": "src/Tavily/tests/", diff --git a/src/Firecrawl/README.md b/src/Firecrawl/README.md new file mode 100644 index 0000000..9ad0bd5 --- /dev/null +++ b/src/Firecrawl/README.md @@ -0,0 +1,197 @@ +# shipfastlabs/toolkit-firecrawl + +[![Latest Version](https://img.shields.io/packagist/v/shipfastlabs/toolkit-firecrawl.svg)](https://packagist.org/packages/shipfastlabs/toolkit-firecrawl) +[![Total Downloads](https://img.shields.io/packagist/dt/shipfastlabs/toolkit-firecrawl.svg)](https://packagist.org/packages/shipfastlabs/toolkit-firecrawl) + +> Firecrawl tools for the Laravel AI SDK - Scrape, Map, Search, and Crawl + +Part of 
the [shipfastlabs/toolkit](https://github.com/shipfastlabs/toolkit) catalog of reusable AI tools for the Laravel AI SDK. + + + +## Installation + +```bash +composer require shipfastlabs/toolkit-firecrawl +``` + +## Usage + +Register every Firecrawl tool at once with the `Firecrawl` helper: + +```php +use Shipfastlabs\Toolkit\Firecrawl\Firecrawl; + +$tools = Firecrawl::all(); // Collection +``` + +Or add individual tools to an agent's `tools()`: + +```php +use Shipfastlabs\Toolkit\Firecrawl\FirecrawlScrape; +use Shipfastlabs\Toolkit\Firecrawl\FirecrawlMap; +use Shipfastlabs\Toolkit\Firecrawl\FirecrawlSearch; +use Shipfastlabs\Toolkit\Firecrawl\FirecrawlCrawl; +use Shipfastlabs\Toolkit\Firecrawl\FirecrawlCrawlStatus; + +$tools = [ + new FirecrawlScrape, + new FirecrawlMap, + new FirecrawlSearch, + new FirecrawlCrawl, + new FirecrawlCrawlStatus, +]; +``` + +Each tool calls a Firecrawl v2 endpoint and returns the raw JSON response (pretty-printed) so the model can read every field, or a friendly error string it can recover from. + +## Tools + +### FirecrawlScrape + +Scrape a single page into clean, LLM-ready markdown. Renders JavaScript, so it works on dynamic pages. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `url` | string | yes | The URL of the web page to scrape. | +| `formats` | string | no | Comma-separated output formats, any of `markdown`, `html`, `rawHtml`, `links`, `summary` (default: `markdown`). | +| `only_main_content` | boolean | no | Return only the main content, stripping navigation, headers and footers (default: `true`). | + +### FirecrawlMap + +Map a website and return its list of URLs. Use it to discover every page before deciding what to scrape. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `url` | string | yes | The base URL of the website to map. | +| `search` | string | no | Only return URLs that match this search term (default: no filter). 
| +| `limit` | integer | no | Maximum number of URLs to return (1-5000, default: 100). | + +### FirecrawlSearch + +Search the web and return ranked results, optionally with each page scraped into markdown. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `query` | string | yes | The search query to look up on the web. | +| `limit` | integer | no | Maximum number of results to return (1-20, default: 5). | +| `sources` | string | no | Comma-separated result sources, any of `web`, `news`, `images` (default: `web`). | +| `scrape_content` | boolean | no | Scrape each result and include its markdown content, not just the link (default: `false`). | + +### FirecrawlCrawl + +Start crawling a website. Crawling is **asynchronous**: this tool returns a crawl `id` immediately and does not wait for the crawl to finish. Pass that `id` to `FirecrawlCrawlStatus` to poll progress and collect results. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `url` | string | yes | The URL to start crawling from. | +| `limit` | integer | no | Maximum number of pages to crawl (1-1000, default: 10). | +| `prompt` | string | no | Natural-language instructions to steer the crawl, e.g. `"only blog posts"`. | + +### FirecrawlCrawlStatus + +Check the status of a crawl started with `FirecrawlCrawl`. Returns the status (`scraping`, `completed`, or `failed`), progress, and the pages crawled so far. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `crawl_id` | string | yes | The crawl id returned by `FirecrawlCrawl`. | + +## Configuration + +Every tool reads its API key from Laravel's `services` config and its optional defaults from the `ai` config. + +### 1. Add the Firecrawl service to `config/services.php` + +```php +// config/services.php + +return [ + + // ... existing services ... + + 'firecrawl' => [ + 'key' => env('FIRECRAWL_API_KEY'), + ], + +]; +``` + +### 2.
Add toolkit defaults to `config/ai.php` + +```php +// config/ai.php + +return [ + + // ... existing laravel/ai config ... + + 'toolkit' => [ + 'firecrawl' => [ + 'scrape' => [ + 'formats' => env('FIRECRAWL_SCRAPE_FORMATS', 'markdown'), + 'only_main_content' => (bool) env('FIRECRAWL_SCRAPE_ONLY_MAIN_CONTENT', true), + ], + 'map' => [ + 'limit' => (int) env('FIRECRAWL_MAP_LIMIT', 100), + ], + 'search' => [ + 'limit' => (int) env('FIRECRAWL_SEARCH_LIMIT', 5), + 'sources' => env('FIRECRAWL_SEARCH_SOURCES', 'web'), + ], + 'crawl' => [ + 'limit' => (int) env('FIRECRAWL_CRAWL_LIMIT', 10), + ], + ], + ], + +]; +``` + +### 3. Add environment variables to `.env` + +```dotenv +FIRECRAWL_API_KEY=fc-your-key-here + +# Scrape defaults +FIRECRAWL_SCRAPE_FORMATS=markdown +FIRECRAWL_SCRAPE_ONLY_MAIN_CONTENT=true + +# Map defaults +FIRECRAWL_MAP_LIMIT=100 + +# Search defaults +FIRECRAWL_SEARCH_LIMIT=5 +FIRECRAWL_SEARCH_SOURCES=web + +# Crawl defaults +FIRECRAWL_CRAWL_LIMIT=10 +``` + +| Config key | Env var | Default | Description | +|---|---|---|---| +| `services.firecrawl.key` | `FIRECRAWL_API_KEY` | - | **Required.** Your Firecrawl API key, sent as a bearer token. | +| `ai.toolkit.firecrawl.scrape.formats` | `FIRECRAWL_SCRAPE_FORMATS` | `"markdown"` | Default scrape formats (comma-separated). | +| `ai.toolkit.firecrawl.scrape.only_main_content` | `FIRECRAWL_SCRAPE_ONLY_MAIN_CONTENT` | `true` | Strip navigation, headers and footers by default. | +| `ai.toolkit.firecrawl.map.limit` | `FIRECRAWL_MAP_LIMIT` | `100` | Default number of URLs to map (1-5000). | +| `ai.toolkit.firecrawl.search.limit` | `FIRECRAWL_SEARCH_LIMIT` | `5` | Default number of search results (1-20). | +| `ai.toolkit.firecrawl.search.sources` | `FIRECRAWL_SEARCH_SOURCES` | `"web"` | Default search sources (comma-separated). | +| `ai.toolkit.firecrawl.crawl.limit` | `FIRECRAWL_CRAWL_LIMIT` | `10` | Default number of pages to crawl (1-1000). 
| + +## Safety + +- All tools validate required inputs before calling the API. +- Numeric parameters are clamped to their valid ranges; `formats` and `sources` are filtered against an allow-list and fall back to a safe default. +- `FirecrawlCrawlStatus` accepts only a well-formed crawl id (`[A-Za-z0-9-]`), so a crawl id can never be used to reach another endpoint. +- API errors and network failures are caught and returned as friendly string messages so the model can recover. +- Requires a valid Firecrawl API key; tools return a clear "not configured" message when it is missing. + +## Firecrawl API + +These tools use the [Firecrawl v2 API](https://docs.firecrawl.dev). Firecrawl offers a free tier to get started. + +Full API reference: +- [Scrape Endpoint](https://docs.firecrawl.dev/api-reference/endpoint/scrape) +- [Map Endpoint](https://docs.firecrawl.dev/api-reference/endpoint/map) +- [Search Endpoint](https://docs.firecrawl.dev/api-reference/endpoint/search) +- [Crawl Endpoint](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post) +- [Crawl Status Endpoint](https://docs.firecrawl.dev/api-reference/endpoint/crawl-get) diff --git a/src/Firecrawl/composer.json b/src/Firecrawl/composer.json new file mode 100644 index 0000000..6ad370e --- /dev/null +++ b/src/Firecrawl/composer.json @@ -0,0 +1,24 @@ +{ + "name": "shipfastlabs/toolkit-firecrawl", + "description": "Firecrawl tools for the Laravel AI SDK - Scrape, Map, Search, and Crawl", + "keywords": ["laravel", "ai", "tool", "firecrawl", "scrape", "crawl", "search", "map"], + "license": "MIT", + "require": { + "php": "^8.4.0", + "illuminate/contracts": "^12.0|^13.0", + "illuminate/support": "^12.0|^13.0", + "laravel/ai": "^0.7" + }, + "autoload": { + "psr-4": { + "Shipfastlabs\\Toolkit\\Firecrawl\\": "src/" + } + }, + "autoload-dev": { + "psr-4": { + "Shipfastlabs\\Toolkit\\Firecrawl\\Tests\\": "tests/" + } + }, + "minimum-stability": "dev", + "prefer-stable": true +} diff --git 
a/src/Firecrawl/src/Concerns/InteractsWithFirecrawl.php b/src/Firecrawl/src/Concerns/InteractsWithFirecrawl.php new file mode 100644 index 0000000..c55ab35 --- /dev/null +++ b/src/Firecrawl/src/Concerns/InteractsWithFirecrawl.php @@ -0,0 +1,123 @@ +connectTimeout(10) + ->timeout($timeout) + ->withToken($apiKey); + } + + /** + * @param array $payload + */ + private function firecrawlPost(string $apiKey, int $timeout, string $path, array $payload, string $label): string + { + try { + $response = $this->firecrawlClient($apiKey, $timeout)->post($path, $payload); + } catch (Throwable $throwable) { + return sprintf('The Firecrawl %s request failed: %s', $label, $throwable->getMessage()); + } + + return $this->firecrawlBody($response, $label); + } + + private function firecrawlGet(string $apiKey, int $timeout, string $path, string $label): string + { + try { + $response = $this->firecrawlClient($apiKey, $timeout)->get($path); + } catch (Throwable $throwable) { + return sprintf('The Firecrawl %s request failed: %s', $label, $throwable->getMessage()); + } + + return $this->firecrawlBody($response, $label); + } + + private function firecrawlBody(Response $response, string $label): string + { + if ($response->failed()) { + return sprintf( + 'The Firecrawl %s request failed with status %d: %s', + $label, + $response->status(), + $response->body() + ); + } + + $data = $response->json(); + + if (! is_array($data)) { + return sprintf('The Firecrawl %s response was invalid.', $label); + } + + return json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR); + } + + private function firecrawlConfigString(string $key, string $default): string + { + $value = config($key, $default); + + return is_string($value) && $value !== '' ? $value : $default; + } + + private function firecrawlConfigInt(string $key, int $default): int + { + $value = config($key, $default); + + return is_numeric($value) ? 
(int) $value : $default; + } + + private function firecrawlConfigBool(string $key, bool $default): bool + { + $value = config($key, $default); + + return is_bool($value) ? $value : $default; + } + + private function firecrawlResolveLimit(Request $request, string $configKey, int $default, int $min, int $max): int + { + $limit = $request->filled('limit') + ? $request->integer('limit') + : $this->firecrawlConfigInt($configKey, $default); + + return max($min, min($max, $limit)); + } + + /** + * @param list $allowed + * @return list + */ + private function firecrawlCsvAllowList(string $raw, array $allowed, string $fallback): array + { + $values = array_values(array_filter( + array_map(trim(...), explode(',', $raw)), + static fn (string $value): bool => in_array($value, $allowed, true), + )); + + return $values === [] ? [$fallback] : $values; + } +} diff --git a/src/Firecrawl/src/Firecrawl.php b/src/Firecrawl/src/Firecrawl.php new file mode 100644 index 0000000..7308ab2 --- /dev/null +++ b/src/Firecrawl/src/Firecrawl.php @@ -0,0 +1,25 @@ + + */ + public static function all(): Collection + { + return new Collection([ + new FirecrawlScrape, + new FirecrawlMap, + new FirecrawlSearch, + new FirecrawlCrawl, + new FirecrawlCrawlStatus, + ]); + } +} diff --git a/src/Firecrawl/src/FirecrawlCrawl.php b/src/Firecrawl/src/FirecrawlCrawl.php new file mode 100644 index 0000000..04d9417 --- /dev/null +++ b/src/Firecrawl/src/FirecrawlCrawl.php @@ -0,0 +1,76 @@ + $schema + ->string() + ->description('The URL to start crawling from.') + ->required(), + 'limit' => $schema + ->integer() + ->description('Maximum number of pages to crawl (1-1000, default: 10).') + ->nullable() + ->required(), + 'prompt' => $schema + ->string() + ->description('Natural-language instructions to steer which pages are crawled, e.g. 
"only blog posts".') + ->nullable() + ->required(), + ]; + } + + public function handle(Request $request): string + { + $url = $request->string('url')->trim(); + + if ($url->isEmpty()) { + return 'The URL is empty. Provide a URL to crawl.'; + } + + $apiKey = $this->firecrawlApiKey(); + + if ($apiKey === null) { + return $this->firecrawlNotConfiguredMessage(); + } + + $payload = [ + 'url' => (string) $url, + 'limit' => $this->firecrawlResolveLimit($request, 'ai.toolkit.firecrawl.crawl.limit', 10, 1, 1000), + ]; + + $prompt = $request->string('prompt')->trim(); + + if ($prompt->isNotEmpty()) { + $payload['prompt'] = (string) $prompt; + } + + return $this->firecrawlPost($apiKey, 30, '/v2/crawl', $payload, 'crawl'); + } +} diff --git a/src/Firecrawl/src/FirecrawlCrawlStatus.php b/src/Firecrawl/src/FirecrawlCrawlStatus.php new file mode 100644 index 0000000..8917e70 --- /dev/null +++ b/src/Firecrawl/src/FirecrawlCrawlStatus.php @@ -0,0 +1,58 @@ + $schema + ->string() + ->description('The crawl id returned by the Firecrawl crawl tool.') + ->required(), + ]; + } + + public function handle(Request $request): string + { + $crawlId = $request->string('crawl_id')->trim(); + + if ($crawlId->isEmpty()) { + return 'The crawl id is empty. Provide the id returned by the Firecrawl crawl tool.'; + } + + if (preg_match('/^[A-Za-z0-9-]+$/', (string) $crawlId) !== 1) { + return 'The crawl id is invalid. 
It should look like the id returned by the Firecrawl crawl tool.'; + } + + $apiKey = $this->firecrawlApiKey(); + + if ($apiKey === null) { + return $this->firecrawlNotConfiguredMessage(); + } + + return $this->firecrawlGet($apiKey, 30, '/v2/crawl/'.$crawlId, 'crawl status'); + } +} diff --git a/src/Firecrawl/src/FirecrawlMap.php b/src/Firecrawl/src/FirecrawlMap.php new file mode 100644 index 0000000..c9d2ad7 --- /dev/null +++ b/src/Firecrawl/src/FirecrawlMap.php @@ -0,0 +1,74 @@ + $schema + ->string() + ->description('The base URL of the website to map.') + ->required(), + 'search' => $schema + ->string() + ->description('Only return URLs that match this search term (default: no filter).') + ->nullable() + ->required(), + 'limit' => $schema + ->integer() + ->description('Maximum number of URLs to return (1-5000, default: 100).') + ->nullable() + ->required(), + ]; + } + + public function handle(Request $request): string + { + $url = $request->string('url')->trim(); + + if ($url->isEmpty()) { + return 'The URL is empty. Provide a URL to map.'; + } + + $apiKey = $this->firecrawlApiKey(); + + if ($apiKey === null) { + return $this->firecrawlNotConfiguredMessage(); + } + + $payload = [ + 'url' => (string) $url, + 'limit' => $this->firecrawlResolveLimit($request, 'ai.toolkit.firecrawl.map.limit', 100, 1, 5000), + ]; + + $search = $request->string('search')->trim(); + + if ($search->isNotEmpty()) { + $payload['search'] = (string) $search; + } + + return $this->firecrawlPost($apiKey, 60, '/v2/map', $payload, 'map'); + } +} diff --git a/src/Firecrawl/src/FirecrawlScrape.php b/src/Firecrawl/src/FirecrawlScrape.php new file mode 100644 index 0000000..e71303b --- /dev/null +++ b/src/Firecrawl/src/FirecrawlScrape.php @@ -0,0 +1,96 @@ + + */ + private const array FORMATS = ['markdown', 'html', 'rawHtml', 'links', 'summary']; + + public function description(): string + { + return <<<'TXT' + Scrape a single web page and return its content as clean, LLM-ready markdown. 
+ Use this to read one specific URL. It renders JavaScript, so it works on dynamic + pages that a plain HTTP fetch cannot read. Optionally request other formats such + as html, rawHtml, links, or a summary. + TXT; + } + + public function schema(JsonSchema $schema): array + { + return [ + 'url' => $schema + ->string() + ->description('The URL of the web page to scrape.') + ->required(), + 'formats' => $schema + ->string() + ->description("Comma-separated output formats, any of 'markdown', 'html', 'rawHtml', 'links', 'summary' (default: 'markdown').") + ->nullable() + ->required(), + 'only_main_content' => $schema + ->boolean() + ->description('Return only the main content, stripping navigation, headers and footers (default: true).') + ->nullable() + ->required(), + ]; + } + + public function handle(Request $request): string + { + $url = $request->string('url')->trim(); + + if ($url->isEmpty()) { + return 'The URL is empty. Provide a URL to scrape.'; + } + + $apiKey = $this->firecrawlApiKey(); + + if ($apiKey === null) { + return $this->firecrawlNotConfiguredMessage(); + } + + $payload = [ + 'url' => (string) $url, + 'formats' => $this->resolveFormats($request), + 'onlyMainContent' => $this->resolveOnlyMainContent($request), + ]; + + return $this->firecrawlPost($apiKey, 60, '/v2/scrape', $payload, 'scrape'); + } + + /** + * @return list + */ + private function resolveFormats(Request $request): array + { + $raw = $request->filled('formats') + ? 
(string) $request->string('formats') + : $this->firecrawlConfigString('ai.toolkit.firecrawl.scrape.formats', 'markdown'); + + return $this->firecrawlCsvAllowList($raw, self::FORMATS, 'markdown'); + } + + private function resolveOnlyMainContent(Request $request): bool + { + if ($request->has('only_main_content') && $request['only_main_content'] !== null) { + return $request->boolean('only_main_content'); + } + + return $this->firecrawlConfigBool('ai.toolkit.firecrawl.scrape.only_main_content', true); + } +} diff --git a/src/Firecrawl/src/FirecrawlSearch.php b/src/Firecrawl/src/FirecrawlSearch.php new file mode 100644 index 0000000..5b096a7 --- /dev/null +++ b/src/Firecrawl/src/FirecrawlSearch.php @@ -0,0 +1,108 @@ + + */ + private const array SOURCES = ['web', 'news', 'images']; + + public function description(): string + { + return <<<'TXT' + Search the web and return ranked results. Use this to find pages when you do not + already have a URL. Optionally pull results from the news or images sources, and + optionally scrape each result into markdown so you get the page content, not just + the link. + TXT; + } + + public function schema(JsonSchema $schema): array + { + return [ + 'query' => $schema + ->string() + ->description('The search query to look up on the web.') + ->required(), + 'limit' => $schema + ->integer() + ->description('Maximum number of results to return (1-20, default: 5).') + ->nullable() + ->required(), + 'sources' => $schema + ->string() + ->description("Comma-separated result sources, any of 'web', 'news', 'images' (default: 'web').") + ->nullable() + ->required(), + 'scrape_content' => $schema + ->boolean() + ->description('Scrape each result and include its markdown content, not just the link (default: false).') + ->nullable() + ->required(), + ]; + } + + public function handle(Request $request): string + { + $query = $request->string('query')->trim(); + + if ($query->isEmpty()) { + return 'The search query is empty. 
Provide a query to search for.'; + } + + $apiKey = $this->firecrawlApiKey(); + + if ($apiKey === null) { + return $this->firecrawlNotConfiguredMessage(); + } + + $payload = [ + 'query' => (string) $query, + 'limit' => $this->firecrawlResolveLimit($request, 'ai.toolkit.firecrawl.search.limit', 5, 1, 20), + 'sources' => $this->resolveSources($request), + ]; + + if ($this->resolveScrapeContent($request)) { + $payload['scrapeOptions'] = [ + 'formats' => ['markdown'], + 'onlyMainContent' => true, + ]; + } + + return $this->firecrawlPost($apiKey, 120, '/v2/search', $payload, 'search'); + } + + /** + * @return list + */ + private function resolveSources(Request $request): array + { + $raw = $request->filled('sources') + ? (string) $request->string('sources') + : $this->firecrawlConfigString('ai.toolkit.firecrawl.search.sources', 'web'); + + return $this->firecrawlCsvAllowList($raw, self::SOURCES, 'web'); + } + + private function resolveScrapeContent(Request $request): bool + { + if ($request->has('scrape_content') && $request['scrape_content'] !== null) { + return $request->boolean('scrape_content'); + } + + return false; + } +} diff --git a/src/Firecrawl/tests/FirecrawlCrawlStatusTest.php b/src/Firecrawl/tests/FirecrawlCrawlStatusTest.php new file mode 100644 index 0000000..37aeb5d --- /dev/null +++ b/src/Firecrawl/tests/FirecrawlCrawlStatusTest.php @@ -0,0 +1,88 @@ +set('services.firecrawl.key', 'test-key'); + + Http::preventStrayRequests(); +}); + +it('has a description', function (): void { + expect((new FirecrawlCrawlStatus)->description())->toContain('crawl'); +}); + +it('is marked as strict', function (): void { + expect(Strict::isAppliedTo(new FirecrawlCrawlStatus))->toBeTrue(); +}); + +it('exposes its schema', function (): void { + $schema = (new FirecrawlCrawlStatus)->schema(new JsonSchemaTypeFactory); + + expect($schema)->toHaveKey('crawl_id'); +}); + +it('returns an error when the crawl id is empty', function (): void { + $result = (new 
FirecrawlCrawlStatus)->handle(new Request(['crawl_id' => ' '])); + + expect($result)->toContain('empty'); +}); + +it('returns an error when the crawl id is malformed', function (): void { + $result = (new FirecrawlCrawlStatus)->handle(new Request(['crawl_id' => '../secrets'])); + + expect($result)->toContain('invalid'); +}); + +it('returns an error when no api key is configured', function (): void { + config()->set('services.firecrawl.key'); + + $result = (new FirecrawlCrawlStatus)->handle(new Request(['crawl_id' => 'abc-123'])); + + expect($result)->toContain('not configured'); +}); + +it('returns the crawl status on success', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/crawl/abc-123' => Http::response([ + 'success' => true, + 'status' => 'completed', + 'completed' => 1, + 'total' => 1, + 'data' => [['markdown' => '# Page']], + ]), + ]); + + $result = (new FirecrawlCrawlStatus)->handle(new Request(['crawl_id' => 'abc-123'])); + + expect($result)->toContain('completed') + ->and($result)->toContain('# Page'); +}); + +it('requests the status endpoint for the given crawl id', function (): void { + Http::fake(function ($request) { + expect($request->url())->toBe('https://api.firecrawl.dev/v2/crawl/abc-123') + ->and($request->method())->toBe('GET'); + + return Http::response(['success' => true, 'status' => 'scraping']); + }); + + (new FirecrawlCrawlStatus)->handle(new Request(['crawl_id' => 'abc-123'])); +}); + +it('returns a friendly error when the request throws', function (): void { + Http::fake(function (): void { + throw new RuntimeException('Connection timed out'); + }); + + $result = (new FirecrawlCrawlStatus)->handle(new Request(['crawl_id' => 'abc-123'])); + + expect($result)->toContain('request failed') + ->and($result)->toContain('Connection timed out'); +}); diff --git a/src/Firecrawl/tests/FirecrawlCrawlTest.php b/src/Firecrawl/tests/FirecrawlCrawlTest.php new file mode 100644 index 0000000..26d3774 --- /dev/null +++ 
b/src/Firecrawl/tests/FirecrawlCrawlTest.php @@ -0,0 +1,116 @@ +set('services.firecrawl.key', 'test-key'); + + Http::preventStrayRequests(); +}); + +it('has a description that mentions it is asynchronous', function (): void { + expect((new FirecrawlCrawl)->description())->toContain('asynchronously'); +}); + +it('is marked as strict', function (): void { + expect(Strict::isAppliedTo(new FirecrawlCrawl))->toBeTrue(); +}); + +it('exposes its schema', function (): void { + $schema = (new FirecrawlCrawl)->schema(new JsonSchemaTypeFactory); + + expect($schema)->toHaveKey('url') + ->and($schema)->toHaveKey('limit') + ->and($schema)->toHaveKey('prompt'); +}); + +it('returns an error when the url is empty', function (): void { + $result = (new FirecrawlCrawl)->handle(new Request(['url' => ' '])); + + expect($result)->toContain('empty'); +}); + +it('returns an error when no api key is configured', function (): void { + config()->set('services.firecrawl.key'); + + $result = (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('not configured'); +}); + +it('returns the crawl id on success', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/crawl' => Http::response([ + 'success' => true, + 'id' => 'abc-123', + 'url' => 'https://api.firecrawl.dev/v2/crawl/abc-123', + ]), + ]); + + $result = (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('abc-123'); +}); + +it('defaults the limit to 10 when omitted', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('limit', 10); + + return Http::response(['success' => true, 'id' => 'abc']); + }); + + (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('uses the configured default limit when omitted', function (): void { + config()->set('ai.toolkit.firecrawl.crawl.limit', 25); + + Http::fake(function ($request) { + 
expect($request->data())->toHaveKey('limit', 25); + + return Http::response(['success' => true, 'id' => 'abc']); + }); + + (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('clamps the limit between 1 and 1000', function (int $input, int $expected): void { + Http::fake(function ($request) use ($expected) { + expect($request->data())->toHaveKey('limit', $expected); + + return Http::response(['success' => true, 'id' => 'abc']); + }); + + (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com', 'limit' => $input])); +})->with([ + 'too low' => [0, 1], + 'minimum' => [1, 1], + 'maximum' => [1000, 1000], + 'too high' => [5000, 1000], +]); + +it('sends a prompt when provided', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('prompt', 'only blog posts'); + + return Http::response(['success' => true, 'id' => 'abc']); + }); + + (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com', 'prompt' => ' only blog posts '])); +}); + +it('omits the prompt when not provided', function (): void { + Http::fake(function ($request) { + expect($request->data())->not->toHaveKey('prompt'); + + return Http::response(['success' => true, 'id' => 'abc']); + }); + + (new FirecrawlCrawl)->handle(new Request(['url' => 'https://example.com'])); +}); diff --git a/src/Firecrawl/tests/FirecrawlMapTest.php b/src/Firecrawl/tests/FirecrawlMapTest.php new file mode 100644 index 0000000..3a068fc --- /dev/null +++ b/src/Firecrawl/tests/FirecrawlMapTest.php @@ -0,0 +1,128 @@ +set('services.firecrawl.key', 'test-key'); + + Http::preventStrayRequests(); +}); + +it('has a description', function (): void { + expect((new FirecrawlMap)->description())->toContain('Map'); +}); + +it('is marked as strict', function (): void { + expect(Strict::isAppliedTo(new FirecrawlMap))->toBeTrue(); +}); + +it('exposes its schema', function (): void { + $schema = (new FirecrawlMap)->schema(new 
JsonSchemaTypeFactory); + + expect($schema)->toHaveKey('url') + ->and($schema)->toHaveKey('search') + ->and($schema)->toHaveKey('limit'); +}); + +it('returns an error when the url is empty', function (): void { + $result = (new FirecrawlMap)->handle(new Request(['url' => ' '])); + + expect($result)->toContain('empty'); +}); + +it('returns an error when no api key is configured', function (): void { + config()->set('services.firecrawl.key'); + + $result = (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('not configured'); +}); + +it('returns the mapped links on success', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/map' => Http::response([ + 'success' => true, + 'links' => [['url' => 'https://example.com/', 'title' => 'Example']], + ]), + ]); + + $result = (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('https://example.com/') + ->and($result)->toContain('Example'); +}); + +it('defaults the limit to 100 when omitted', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('limit', 100); + + return Http::response(['success' => true, 'links' => []]); + }); + + (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('uses the configured default limit when omitted', function (): void { + config()->set('ai.toolkit.firecrawl.map.limit', 50); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('limit', 50); + + return Http::response(['success' => true, 'links' => []]); + }); + + (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('falls back to 100 when the configured limit is not numeric', function (): void { + config()->set('ai.toolkit.firecrawl.map.limit', 'lots'); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('limit', 100); + + return Http::response(['success' => true, 'links' => 
[]]); + }); + + (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('clamps the limit between 1 and 5000', function (int $input, int $expected): void { + Http::fake(function ($request) use ($expected) { + expect($request->data())->toHaveKey('limit', $expected); + + return Http::response(['success' => true, 'links' => []]); + }); + + (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com', 'limit' => $input])); +})->with([ + 'too low' => [0, 1], + 'minimum' => [1, 1], + 'maximum' => [5000, 5000], + 'too high' => [9999, 5000], +]); + +it('sends a search filter when provided', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('search', 'blog'); + + return Http::response(['success' => true, 'links' => []]); + }); + + (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com', 'search' => ' blog '])); +}); + +it('omits the search filter when not provided', function (): void { + Http::fake(function ($request) { + expect($request->data())->not->toHaveKey('search'); + + return Http::response(['success' => true, 'links' => []]); + }); + + (new FirecrawlMap)->handle(new Request(['url' => 'https://example.com'])); +}); diff --git a/src/Firecrawl/tests/FirecrawlScrapeTest.php b/src/Firecrawl/tests/FirecrawlScrapeTest.php new file mode 100644 index 0000000..9b8b4c0 --- /dev/null +++ b/src/Firecrawl/tests/FirecrawlScrapeTest.php @@ -0,0 +1,194 @@ +set('services.firecrawl.key', 'test-key'); + + Http::preventStrayRequests(); +}); + +it('has a description', function (): void { + expect((new FirecrawlScrape)->description())->toContain('Scrape'); +}); + +it('is marked as strict', function (): void { + expect(Strict::isAppliedTo(new FirecrawlScrape))->toBeTrue(); +}); + +it('exposes its schema', function (): void { + $schema = (new FirecrawlScrape)->schema(new JsonSchemaTypeFactory); + + expect($schema)->toHaveKey('url') + ->and($schema)->toHaveKey('formats') + 
->and($schema)->toHaveKey('only_main_content'); +}); + +it('returns an error when the url is empty', function (): void { + $result = (new FirecrawlScrape)->handle(new Request(['url' => ' '])); + + expect($result)->toContain('empty'); +}); + +it('returns an error when no api key is configured', function (): void { + config()->set('services.firecrawl.key'); + + $result = (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('not configured'); +}); + +it('returns the scraped content on success', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/scrape' => Http::response(['success' => true, 'data' => ['markdown' => '# Example']]), + ]); + + $result = (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('# Example'); +}); + +it('sends the api key as a bearer token', function (): void { + Http::fake(function ($request) { + expect($request->hasHeader('Authorization', 'Bearer test-key'))->toBeTrue(); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('returns an error when the api responds with a failure', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/scrape' => Http::response('Unauthorized', 401), + ]); + + $result = (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('failed with status 401'); +}); + +it('returns an error when the response is not an array', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/scrape' => Http::response('"plain string"', 200, ['Content-Type' => 'application/json']), + ]); + + $result = (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('invalid'); +}); + +it('returns a friendly error when the request throws', function (): void { + Http::fake(function (): void 
{ + throw new RuntimeException('Connection timed out'); + }); + + $result = (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); + + expect($result)->toContain('request failed') + ->and($result)->toContain('Connection timed out'); +}); + +it('defaults to markdown when formats are omitted', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('formats', ['markdown']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('parses comma-separated formats', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('formats', ['markdown', 'links']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com', 'formats' => 'markdown, links'])); +}); + +it('drops invalid formats and falls back to markdown', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('formats', ['markdown']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com', 'formats' => 'bogus, nope'])); +}); + +it('uses the configured default formats when omitted', function (): void { + config()->set('ai.toolkit.firecrawl.scrape.formats', 'html,links'); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('formats', ['html', 'links']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('falls back to markdown when the configured formats are not a string', function (): void { + config()->set('ai.toolkit.firecrawl.scrape.formats', 123); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('formats', ['markdown']); + + return 
Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('defaults only_main_content to true', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('onlyMainContent', true); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('respects an explicit only_main_content of false', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('onlyMainContent', false); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com', 'only_main_content' => false])); +}); + +it('uses the configured default only_main_content when omitted', function (): void { + config()->set('ai.toolkit.firecrawl.scrape.only_main_content', false); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('onlyMainContent', false); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); + +it('falls back to true when the configured only_main_content is not a boolean', function (): void { + config()->set('ai.toolkit.firecrawl.scrape.only_main_content', 'yes'); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('onlyMainContent', true); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlScrape)->handle(new Request(['url' => 'https://example.com'])); +}); diff --git a/src/Firecrawl/tests/FirecrawlSearchTest.php b/src/Firecrawl/tests/FirecrawlSearchTest.php new file mode 100644 index 0000000..fc96f96 --- /dev/null +++ b/src/Firecrawl/tests/FirecrawlSearchTest.php @@ -0,0 +1,153 @@ +set('services.firecrawl.key', 'test-key'); + + Http::preventStrayRequests(); +}); + +it('has a 
description', function (): void { + expect((new FirecrawlSearch)->description())->toContain('Search the web'); +}); + +it('is marked as strict', function (): void { + expect(Strict::isAppliedTo(new FirecrawlSearch))->toBeTrue(); +}); + +it('exposes its schema', function (): void { + $schema = (new FirecrawlSearch)->schema(new JsonSchemaTypeFactory); + + expect($schema)->toHaveKey('query') + ->and($schema)->toHaveKey('limit') + ->and($schema)->toHaveKey('sources') + ->and($schema)->toHaveKey('scrape_content'); +}); + +it('returns an error when the query is empty', function (): void { + $result = (new FirecrawlSearch)->handle(new Request(['query' => ' '])); + + expect($result)->toContain('empty'); +}); + +it('returns an error when no api key is configured', function (): void { + config()->set('services.firecrawl.key'); + + $result = (new FirecrawlSearch)->handle(new Request(['query' => 'laravel'])); + + expect($result)->toContain('not configured'); +}); + +it('returns the search results on success', function (): void { + Http::fake([ + 'https://api.firecrawl.dev/v2/search' => Http::response([ + 'success' => true, + 'data' => ['web' => [['url' => 'https://laravel.com', 'title' => 'Laravel']]], + ]), + ]); + + $result = (new FirecrawlSearch)->handle(new Request(['query' => 'laravel'])); + + expect($result)->toContain('https://laravel.com') + ->and($result)->toContain('Laravel'); +}); + +it('defaults the limit to 5 and the source to web', function (): void { + Http::fake(function ($request) { + expect($request->data()) + ->toHaveKey('limit', 5) + ->toHaveKey('sources', ['web']) + ->not->toHaveKey('scrapeOptions'); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel'])); +}); + +it('uses the configured default limit when omitted', function (): void { + config()->set('ai.toolkit.firecrawl.search.limit', 8); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('limit', 
8); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel'])); +}); + +it('clamps the limit between 1 and 20', function (int $input, int $expected): void { + Http::fake(function ($request) use ($expected) { + expect($request->data())->toHaveKey('limit', $expected); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel', 'limit' => $input])); +})->with([ + 'too low' => [0, 1], + 'minimum' => [1, 1], + 'maximum' => [20, 20], + 'too high' => [99, 20], +]); + +it('parses comma-separated sources', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('sources', ['web', 'news']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel', 'sources' => 'web, news'])); +}); + +it('drops invalid sources and falls back to web', function (): void { + Http::fake(function ($request) { + expect($request->data())->toHaveKey('sources', ['web']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel', 'sources' => 'bogus'])); +}); + +it('uses the configured default sources when omitted', function (): void { + config()->set('ai.toolkit.firecrawl.search.sources', 'news'); + + Http::fake(function ($request) { + expect($request->data())->toHaveKey('sources', ['news']); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel'])); +}); + +it('adds scrape options when scrape_content is true', function (): void { + Http::fake(function ($request) { + expect($request->data()) + ->toHaveKey('scrapeOptions', ['formats' => ['markdown'], 'onlyMainContent' => true]); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new 
FirecrawlSearch)->handle(new Request(['query' => 'laravel', 'scrape_content' => true])); +}); + +it('omits scrape options when scrape_content is false', function (): void { + Http::fake(function ($request) { + expect($request->data())->not->toHaveKey('scrapeOptions'); + + return Http::response(['success' => true, 'data' => []]); + }); + + (new FirecrawlSearch)->handle(new Request(['query' => 'laravel', 'scrape_content' => false])); +}); diff --git a/src/Firecrawl/tests/FirecrawlTest.php b/src/Firecrawl/tests/FirecrawlTest.php new file mode 100644 index 0000000..8e288b4 --- /dev/null +++ b/src/Firecrawl/tests/FirecrawlTest.php @@ -0,0 +1,13 @@ +toHaveCount(5) + ->and($tools->every(fn (Tool $tool): bool => $tool instanceof Tool))->toBeTrue(); +});