diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..ce1853b
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,27 @@
+name: Publish to npm
+
+on:
+ release:
+ types: [published]
+
+jobs:
+ publish:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+ registry-url: https://registry.npmjs.org
+
+ - name: Install dependencies
+ run: npm ci
+
+ - name: Publish to npm
+ run: npm publish --access public
+ env:
+ NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
diff --git a/AGENTS.md b/AGENTS.md
index 2ccd167..c747e21 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,274 +1,82 @@
-# AGENTS.md — AI Agent Guide for mcp-selenium
+# AGENTS.md
-> This file helps AI agents (and humans) quickly understand, navigate, and safely
-> contribute to this project.
-
-## Project Overview
-
-**mcp-selenium** is a [Model Context Protocol (MCP)](https://modelcontextprotocol.io)
-server that exposes browser automation capabilities via
-[Selenium WebDriver](https://www.selenium.dev/documentation/webdriver/). It allows
-LLM-powered applications (Goose, Claude Code, Cursor, etc.) to launch browsers, navigate
-pages, interact with elements, take screenshots, and more - all through the standardized
-MCP tool interface.
-
-- **Language:** JavaScript (ES Modules)
-- **Runtime:** Node.js
-- **Transport:** stdio (JSON-RPC 2.0 over stdin/stdout)
-- **MCP SDK:** `@modelcontextprotocol/sdk` ^1.7.0
-
----
+MCP server for Selenium WebDriver browser automation. JavaScript (ES Modules), Node.js, stdio transport (JSON-RPC 2.0).
## File Map
+```text
+src/lib/server.js ← ALL server logic: tool definitions, state, helpers, cleanup
+src/index.js ← Thin CLI wrapper, spawns server.js as child process
+test/mcp-client.mjs ← Reusable MCP test client (JSON-RPC over stdio)
+test/*.test.mjs ← Tests grouped by feature
+test/fixtures/*.html ← HTML files loaded via file:// URLs in tests
```
-mcp-selenium/
-├── AGENTS.md ← You are here
-├── README.md ← User-facing docs: installation, usage, tool reference
-├── package.json ← Dependencies, scripts, npm metadata
-├── smithery.yaml ← Smithery deployment config (stdio start command)
-├── src/
-│ ├── index.js ← CLI entry point: spawns server.js as child process
-│ └── lib/
-│ └── server.js ← ⭐ ALL server logic lives here
-└── test/
- ├── mcp-client.mjs ← Reusable MCP client for tests (JSON-RPC over stdio)
- ├── server.test.mjs ← Server init, tool registration, schemas
- ├── browser.test.mjs ← start_browser, close_session, take_screenshot, multi-session
- ├── navigation.test.mjs ← navigate, all 6 locator strategies
- ├── interactions.test.mjs ← click, send_keys, get_element_text, hover, double_click, right_click, press_key, drag_and_drop, upload_file
- ├── bidi.test.mjs ← BiDi enablement, console/error/network capture, session isolation
- └── fixtures/ ← HTML files loaded via file:// URLs
- ├── locators.html
- ├── interactions.html
- ├── mouse-actions.html
- ├── drag-drop.html
- ├── upload.html
- └── bidi.html
-```
-
-### Key Files in Detail
-
-| File | Purpose | When to Edit |
-|------|---------|--------------|
-| `src/lib/server.js` | MCP server: tool definitions, resource definitions, Selenium driver management, cleanup handlers | Adding/modifying tools, fixing MCP compliance, changing browser behavior |
-| `src/index.js` | Thin CLI wrapper that spawns `server.js` as a child process with signal forwarding | Only if changing how the process is launched |
-| `test/mcp-client.mjs` | Reusable MCP client that spawns the server, handles handshake, provides `callTool()` / `listTools()` / `fixture()` helpers | When changing test infrastructure |
-| `test/fixtures/` | Purpose-built HTML files for tests, one per test category | When a test needs elements not in existing fixtures |
-| `package.json` | npm metadata, dependency versions, `"bin"` entry for `mcp-selenium` CLI | Bumping versions, adding dependencies |
-| `smithery.yaml` | Declares how Smithery should start the server (`node src/lib/server.js`) | Only if changing the start command |
-| `README.md` | User docs: installation, client config examples, tool reference table | When adding/removing/changing tools |
-
----
## Architecture
-### Server Initialization (`server.js`)
-
-```
-McpServer (name: "MCP Selenium", version: "1.0.0")
- ↓
-Registers tools via server.tool(name, description, zodSchema, handler)
- ↓
-Registers resource via server.resource(name, ResourceTemplate, handler)
- ↓
-Connects to StdioServerTransport
- ↓
-Listens on stdin/stdout (JSON-RPC 2.0)
-```
-
-### State Management
-
-All browser state is held in a module-level `state` object:
+**Single-file server** — everything is in `server.js`. 18 tools, 1 resource.
+State is a module-level object:
```js
const state = {
drivers: new Map(), // sessionId → WebDriver instance
- currentSession: null, // string | null — the active session ID
+ currentSession: null, // active session ID
bidi: new Map() // sessionId → { available, consoleLogs, pageErrors, networkLogs }
};
```
-- **Session IDs** are formatted as `{browser}_{Date.now()}` (e.g., `chrome_1708531200000`)
-- Only one session is "current" at a time (set by `start_browser`, cleared by `close_session`)
-- Multiple sessions can exist in the `drivers` Map, but tools always operate on `currentSession`
-- **BiDi state** is a single Map of per-session objects — cleanup is one `state.bidi.delete(sessionId)` call
-
-### Helper Functions
-
-| Function | Purpose |
-|----------|---------|
-| `getDriver()` | Returns the WebDriver for `state.currentSession`. Throws if no active session. |
-| `getLocator(by, value)` | Converts a locator strategy string (`"id"`, `"css"`, `"xpath"`, `"name"`, `"tag"`, `"class"`) to a Selenium `By` object. |
-| `newBidiState()` | Returns a fresh `{ available, consoleLogs, pageErrors, networkLogs }` object for a new session. |
-| `setupBidi(driver, sessionId)` | Wires up BiDi event listeners (console, JS errors, network) for a session. Called from `start_browser`. |
-| `registerBidiTool(name, description, logKey, emptyMessage, unavailableMessage)` | Factory that registers a diagnostic tool. All three BiDi tools (`get_console_logs`, `get_page_errors`, `get_network_logs`) use this — don't copy-paste a new handler, call this instead. |
-
-### Diagnostics (WebDriver BiDi)
-
-The server automatically enables [WebDriver BiDi](https://w3c.github.io/webdriver-bidi/) when starting a browser session. BiDi provides real-time, passive capture of browser diagnostics — console messages, JavaScript errors, and network activity are collected in the background without any extra configuration.
-
-This is especially useful for AI agents: when something goes wrong on a page, the agent can check `get_console_logs` and `get_page_errors` to understand *why*, rather than relying solely on screenshots.
-
-- **Automatic**: BiDi is enabled by default when the browser supports it
-- **Graceful fallback**: If the browser or driver doesn't support BiDi, the session starts normally and the diagnostic tools return a helpful message
-- **No performance impact**: Logs are passively captured via event listeners — no polling or extra requests
-- **Per-session**: Each browser session has its own log buffers, cleaned up automatically on session close
-- **BiDi modules are dynamically imported** at the top of `server.js` — if the selenium-webdriver version doesn't include them, `LogInspector` and `Network` are set to `null` and all BiDi code is skipped
-
-### Cleanup
-
-- `SIGTERM` and `SIGINT` handlers call `cleanup()`, which quits all drivers and exits
-- `src/index.js` forwards these signals to the child process
-
----
-
-## Development Guide
-
-### Prerequisites
-
-- **Node.js** (check `package.json` for engine requirements)
-- **A browser + matching WebDriver** on PATH:
- - Chrome → `chromedriver`
- - Firefox → `geckodriver`
- - Edge → `msedgedriver`
-
-### Setup
-
-```bash
-npm install
-```
-
-### Running Locally
+Related operations are consolidated into single tools with `action` enum parameters (`interact`, `window`, `frame`, `alert`, `diagnostics`). This is intentional — it reduces context window token cost for LLM consumers.
-```bash
-# Direct execution (for testing)
-node src/lib/server.js
+BiDi (WebDriver BiDi) is auto-enabled on `start_browser` for passive capture of console logs, JS errors, and network activity. Modules are dynamically imported — if unavailable, BiDi is silently skipped.
-# Via the CLI entry point
-node src/index.js
+## Conventions
-# Via npm (uses the "bin" field)
-npx mcp-selenium
-```
+- **ES Modules** — `import`/`export`, not `require`.
+- **Zod schemas** — tool inputs defined with Zod, auto-converted to JSON Schema by MCP SDK.
+- **Error pattern** — every handler: `try/catch`, return `{ content: [...], isError: true }` on failure.
+- **No `console.log()`** — stdout carries the JSON-RPC stream (stdio transport), so any `console.log()` corrupts it. Use `console.error()` for debug output.
+- **`send_keys` clears first** — calls `element.clear()` before typing. Intentional.
+- **MCP compliance** — before modifying server behavior, read the [MCP spec](https://modelcontextprotocol.io/specification/2025-11-25). Don't violate it.
-The server communicates over **stdin/stdout** — it will appear to hang because it's
-waiting for JSON-RPC input. Use an MCP client (Goose, Claude Code, mcp-cli) to
-interact with it.
+## Adding a Tool
-### Project Conventions
-
-1. **ES Modules** — The project uses `"type": "module"` in package.json. Use `import`/`export`, not `require`.
-2. **Zod for schemas** — All tool input schemas are defined with Zod and automatically converted to JSON Schema by the MCP SDK.
-3. **Error handling pattern** — Every tool handler wraps its logic in `try/catch` and returns error text in the `content` array with `isError: true`.
-4. **No TypeScript** — The project is plain JavaScript with no build step.
-5. **Single-file server** — All MCP logic is in `server.js`. There is no router, no middleware, no framework beyond the MCP SDK.
-6. **MCP compliance** — Before modifying server behavior, read the [MCP spec](https://modelcontextprotocol.io/specification/2025-11-25). Don't violate it.
-
-### Adding a New Tool
-
-Follow this pattern in `server.js`:
+Before adding, ask: can this be a parameter on an existing tool? Would an LLM realistically call it? Can `execute_script` already do it?
+Pattern:
```js
-server.tool(
- "tool_name", // unique name (snake_case)
- "Human-readable description of the tool", // description
- { // Zod input schema
- param: z.string().describe("What this param does"),
- optionalParam: z.number().optional().describe("Optional param")
- },
- async ({ param, optionalParam }) => { // handler
- try {
- const driver = getDriver(); // get active browser session
- // ... do work with Selenium ...
- return {
- content: [{ type: 'text', text: 'Success message' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error: ${e.message}` }],
- isError: true // ← Don't forget this!
- };
- }
+server.tool("tool_name", "description", {
+ param: z.string().describe("short phrase")
+}, async ({ param }) => {
+ try {
+ const driver = getDriver();
+ // ... selenium work ...
+ return { content: [{ type: 'text', text: 'result' }] };
+ } catch (e) {
+ return { content: [{ type: 'text', text: `Error: ${e.message}` }], isError: true };
}
-);
+});
```
-After adding a tool:
-1. Add tests to the appropriate file in `test/` (see **Testing** below)
-2. Run `npm test` and confirm all tests pass
-3. Update `README.md` with the new tool's documentation
-
-### Adding a New Resource
-
-```js
-server.resource(
- "resource-name",
- new ResourceTemplate("scheme://path"),
- async (uri) => ({
- contents: [{
- uri: uri.href,
- mimeType: "text/plain", // ← Don't forget mimeType
- text: "Resource content"
- }]
- })
-);
-```
-
----
+After adding: add tests, run `npm test`, update README.
## Testing
-> **Testing philosophy: Verify outcomes, not absence of errors.** Every test must
-> assert that the action had the expected effect — not just that it didn't crash.
-> If you click a button, check that the thing it was supposed to do actually happened.
-> If you find an element, confirm it's the right one. If a test is failing, fix the
-> code or the test setup — never weaken the assertion to get a green check. A passing
-> test that proves nothing is worse than no test at all.
-
-The project has a regression test suite using Node's built-in `node:test` runner — zero external test dependencies.
-
-### Running Tests
-
```bash
npm test
```
-Requires Chrome + chromedriver on PATH. Tests run headless.
-
-### How It Works
+Requires Chrome + chromedriver on PATH. Tests run headless. Uses Node's built-in `node:test` runner — no external test dependencies.
-Tests talk to the real MCP server over stdio using JSON-RPC 2.0. No mocking.
+Tests talk to the real MCP server over stdio. No mocking. Each test file uses **one McpClient** (one server process) for the whole file — do not spin up multiple clients per file.
-- **`test/mcp-client.mjs`** — Reusable client that spawns the server, handles the MCP handshake, and provides `callTool()` / `listTools()` helpers.
-- **`test/fixtures/`** — HTML files loaded via `file://` URLs. Each test file uses its own fixture. Use the `fixture('name.html')` helper to resolve paths.
-
-### Test Files
+**Verify outcomes, not absence of errors.** If you click a button, check that what it was supposed to do actually happened. If a test is failing, fix the code or the test setup — never weaken the assertion.
| File | Covers |
|------|--------|
-| `server.test.mjs` | Server init, tool registration, schemas |
+| `server.test.mjs` | Tool registration, schemas |
| `browser.test.mjs` | start_browser, close_session, take_screenshot, multi-session |
-| `navigation.test.mjs` | navigate, all 6 locator strategies (id, css, xpath, name, tag, class) |
-| `interactions.test.mjs` | click, send_keys, get_element_text, hover, double_click, right_click, press_key, drag_and_drop, upload_file |
-| `bidi.test.mjs` | BiDi enablement, console log capture, page error capture, network log capture, session isolation |
-
-### When Adding a New Tool
-
-1. Add a fixture in `test/fixtures/` if the tool needs HTML elements not covered by existing fixtures
-2. Add tests to the appropriate `test/*.test.mjs` file (or create a new one)
-3. **Verify outcomes** — don't just check for "no error". Use `get_element_text` or other tools to confirm the action had the expected effect on the DOM
-4. Run `npm test` and confirm all tests pass
-
----
-
-## Common Pitfalls
-
-| Pitfall | Details |
-|---------|---------|
-| **"No active browser session"** | Most tools require `start_browser` to be called first. `getDriver()` throws if `state.currentSession` is null. |
-| **WebDriver not on PATH** | Selenium requires the browser's driver binary (chromedriver, geckodriver, etc.) to be installed and on PATH. |
-| **stdout pollution** | The server uses stdio transport. Any `console.log()` will corrupt the JSON-RPC stream. Use `console.error()` for debug output. |
-| **`send_keys` clears first** | The `send_keys` tool calls `element.clear()` before typing. This is intentional but may surprise users expecting append behavior. |
-| **No session switching** | Multiple sessions can exist in `state.drivers`, but there's no tool to switch `currentSession` between them. |
-| **Headless flag differs by browser** | Chrome/Edge use `--headless=new`, Firefox uses `--headless`. This is handled correctly in the code. |
+| `navigation.test.mjs` | navigate, locator strategies (id, css, xpath, name, tag, class) |
+| `interactions.test.mjs` | interact, send_keys, get_element_text, press_key, upload_file |
+| `tools.test.mjs` | get_element_attribute, execute_script, window, frame, alert |
+| `cookies.test.mjs` | add_cookie, get_cookies, delete_cookie |
+| `bidi.test.mjs` | diagnostics (console/errors/network), session isolation |
diff --git a/README.md b/README.md
index 67da24d..018d93a 100644
--- a/README.md
+++ b/README.md
@@ -1,848 +1,265 @@
# MCP Selenium Server
-A Model Context Protocol (MCP) server implementation for Selenium WebDriver, enabling browser automation through standardized MCP clients.
+A Model Context Protocol (MCP) server for Selenium WebDriver — browser automation for AI agents.
+
+[Watch the video demo](https://youtu.be/mRV0N8hcgYA)
-## Video Demo (Click to Watch)
-
-[](https://youtu.be/mRV0N8hcgYA)
-
-
-## Features
-
-- Start browser sessions with customizable options
-- Navigate to URLs
-- Find elements using various locator strategies
-- Click, type, and interact with elements
-- Perform mouse actions (hover, drag and drop)
-- Handle keyboard input
-- Take screenshots
-- Upload files
-- Support for headless mode
-- Manage browser cookies (add, get, delete)
-- **Real-time diagnostics** via WebDriver BiDi:
- - Console log capture (info, warn, error)
- - JavaScript error detection with stack traces
- - Network request monitoring (successes and failures)
-
-## Supported Browsers
-
-- Chrome
-- Firefox
-- MS Edge
-- Safari (macOS only)
-
-> **Note:** Safari requires macOS with `safaridriver`, which is included with Safari.
-> Run `sudo safaridriver --enable` once, then enable "Allow Remote Automation"
-> in Safari → Settings → Developer. Safari does not support headless mode or
-> custom browser arguments.
+## Setup
-## Use with Goose
-
-### Option 1: One-click install
-Copy and paste the link below into a browser address bar to add this extension to goose desktop:
+
+Goose (Desktop)
+Paste into your browser address bar:
```
goose://extension?cmd=npx&arg=-y&arg=%40angiejones%2Fmcp-selenium&id=selenium-mcp&name=Selenium%20MCP&description=automates%20browser%20interactions
```
+
+
+Goose (CLI)
-### Option 2: Add manually to desktop or CLI
-
-* Name: `Selenium MCP`
-* Description: `automates browser interactions`
-* Command: `npx -y @angiejones/mcp-selenium`
-
-## Use with other MCP clients (e.g. Claude Code, Cursor, etc)
-```json
-{
- "mcpServers": {
- "selenium": {
- "command": "npx",
- "args": ["-y", "@angiejones/mcp-selenium"]
- }
- }
-}
-```
-
----
-
-## Development
-
-To work on this project:
-
-1. Clone the repository
-2. Install dependencies: `npm install`
-3. Run the server: `npm start`
-
-### Installation
-
-#### Installing via Smithery
-
-To install MCP Selenium for Claude automatically via [Smithery](https://smithery.ai/server/@angiejones/mcp-selenium):
-
-```bash
-npx -y @smithery/cli install @angiejones/mcp-selenium --client claude
-```
-
-#### Manual Installation
```bash
-npm install -g @angiejones/mcp-selenium
+goose session --with-extension "npx -y @angiejones/mcp-selenium"
```
+
-
-### Usage
-
-Start the server by running:
+
+Claude Code
```bash
-mcp-selenium
+claude mcp add selenium -- npx -y @angiejones/mcp-selenium
```
+
-Or use with NPX in your MCP configuration:
+
+Cursor / Windsurf / other MCP clients
```json
{
"mcpServers": {
"selenium": {
"command": "npx",
- "args": [
- "-y",
- "@angiejones/mcp-selenium"
- ]
- }
- }
-}
-```
-
-
-
-## Tools
-
-### start_browser
-Launches a browser session.
-
-**Parameters:**
-- `browser` (required): Browser to launch
- - Type: string
- - Enum: ["chrome", "firefox", "edge", "safari"]
-- `options`: Browser configuration options
- - Type: object
- - Properties:
- - `headless`: Run browser in headless mode
- - Type: boolean
- - `arguments`: Additional browser arguments
- - Type: array of strings
-
-**Example:**
-```json
-{
- "tool": "start_browser",
- "parameters": {
- "browser": "chrome",
- "options": {
- "headless": true,
- "arguments": ["--no-sandbox"]
+ "args": ["-y", "@angiejones/mcp-selenium"]
}
}
}
```
+
-### navigate
-Navigates to a URL.
-
-**Parameters:**
-- `url` (required): URL to navigate to
- - Type: string
-
-**Example:**
-```json
-{
- "tool": "navigate",
- "parameters": {
- "url": "https://www.example.com"
- }
-}
-```
-
-### find_element
-Finds an element on the page.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "find_element",
- "parameters": {
- "by": "id",
- "value": "search-input",
- "timeout": 5000
- }
-}
-```
-
-### click_element
-Clicks an element.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "click_element",
- "parameters": {
- "by": "css",
- "value": ".submit-button"
- }
-}
-```
-
-### send_keys
-Sends keys to an element (typing).
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `text` (required): Text to enter into the element
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "send_keys",
- "parameters": {
- "by": "name",
- "value": "username",
- "text": "testuser"
- }
-}
-```
-
-### get_element_text
-Gets the text() of an element.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "get_element_text",
- "parameters": {
- "by": "css",
- "value": ".message"
- }
-}
-```
-
-### hover
-Moves the mouse to hover over an element.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "hover",
- "parameters": {
- "by": "css",
- "value": ".dropdown-menu"
- }
-}
-```
-
-### drag_and_drop
-Drags an element and drops it onto another element.
-
-**Parameters:**
-- `by` (required): Locator strategy for source element
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the source locator strategy
- - Type: string
-- `targetBy` (required): Locator strategy for target element
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `targetValue` (required): Value for the target locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for elements in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "drag_and_drop",
- "parameters": {
- "by": "id",
- "value": "draggable",
- "targetBy": "id",
- "targetValue": "droppable"
- }
-}
-```
-
-### double_click
-Performs a double click on an element.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "double_click",
- "parameters": {
- "by": "css",
- "value": ".editable-text"
- }
-}
-```
-
-### right_click
-Performs a right click (context click) on an element.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "right_click",
- "parameters": {
- "by": "css",
- "value": ".context-menu-trigger"
- }
-}
-```
-
-### press_key
-Simulates pressing a keyboard key.
+## Example Usage
-**Parameters:**
-- `key` (required): Key to press (e.g., 'Enter', 'Tab', 'a', etc.)
- - Type: string
+Tell the AI agent of your choice:
-**Example:**
-```json
-{
- "tool": "press_key",
- "parameters": {
- "key": "Enter"
- }
-}
-```
+> Open Chrome, go to github.com/angiejones, and take a screenshot.
-### upload_file
-Uploads a file using a file input element.
-
-**Parameters:**
-- `by` (required): Locator strategy
- - Type: string
- - Enum: ["id", "css", "xpath", "name", "tag", "class"]
-- `value` (required): Value for the locator strategy
- - Type: string
-- `filePath` (required): Absolute path to the file to upload
- - Type: string
-- `timeout`: Maximum time to wait for element in milliseconds
- - Type: number
- - Default: 10000
-
-**Example:**
-```json
-{
- "tool": "upload_file",
- "parameters": {
- "by": "id",
- "value": "file-input",
- "filePath": "/path/to/file.pdf"
- }
-}
-```
-
-### take_screenshot
-Captures a screenshot of the current page.
-
-**Parameters:**
-- `outputPath` (optional): Path where to save the screenshot. If not provided, returns base64 data.
- - Type: string
+The agent will call this server's MCP tools — `start_browser`, `navigate`, and `take_screenshot` — which drive the browser through Selenium WebDriver. No manual scripting or step-by-step directions needed.
-**Example:**
-```json
-{
- "tool": "take_screenshot",
- "parameters": {
- "outputPath": "/path/to/screenshot.png"
- }
-}
-```
+## Supported Browsers
-### close_session
-Closes the current browser session and cleans up resources.
+Chrome, Firefox, Edge, and Safari.
-**Parameters:**
-None required
+> **Safari note:** Requires macOS. Run `sudo safaridriver --enable` once and enable
+> "Allow Remote Automation" in Safari → Settings → Developer. Safari does not support headless mode or custom browser arguments.
-**Example:**
-```json
-{
- "tool": "close_session",
- "parameters": {}
-}
-```
+---
+
+Tools
-### clear_element
-Clears the content of an input or textarea element.
+### start_browser
+Launches a browser session.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| by | string | Yes | Locator strategy (id, css, xpath, name, tag, class) |
-| value | string | Yes | Value for the locator strategy |
-
-**Example:**
-```json
-{
- "tool": "clear_element",
- "parameters": {
- "by": "id",
- "value": "search-input"
- }
-}
-```
+| browser | string | Yes | `chrome`, `firefox`, `edge`, or `safari` |
+| options | object | No | `{ headless: boolean, arguments: string[] }` |
-### get_element_attribute
-Gets the value of an attribute from an element.
+### navigate
+Navigates to a URL.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| by | string | Yes | Locator strategy (id, css, xpath, name, tag, class) |
-| value | string | Yes | Value for the locator strategy |
-| attribute | string | Yes | Name of the attribute to retrieve |
+| url | string | Yes | URL to navigate to |
-**Example:**
-```json
-{
- "tool": "get_element_attribute",
- "parameters": {
- "by": "id",
- "value": "my-link",
- "attribute": "href"
- }
-}
-```
-
-### scroll_to_element
-Scrolls the page to make an element visible.
+### interact
+Performs a mouse action on an element.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| by | string | Yes | Locator strategy (id, css, xpath, name, tag, class) |
+| action | string | Yes | `click`, `doubleclick`, `rightclick`, or `hover` |
+| by | string | Yes | Locator strategy: `id`, `css`, `xpath`, `name`, `tag`, `class` |
| value | string | Yes | Value for the locator strategy |
+| timeout | number | No | Max wait in ms (default: 10000) |
-**Example:**
-```json
-{
- "tool": "scroll_to_element",
- "parameters": {
- "by": "id",
- "value": "footer"
- }
-}
-```
-
-### execute_script
-Executes JavaScript in the browser and returns the result.
+### send_keys
+Types text into an element. Clears the field first.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| script | string | Yes | JavaScript code to execute |
-| args | array | No | Arguments to pass to the script (accessible via `arguments[0]`, `arguments[1]`, etc.) |
+| by | string | Yes | Locator strategy |
+| value | string | Yes | Locator value |
+| text | string | Yes | Text to enter |
+| timeout | number | No | Max wait in ms (default: 10000) |
-**Example:**
-```json
-{
- "tool": "execute_script",
- "parameters": {
- "script": "return document.title;"
- }
-}
-```
-
-**Example with arguments:**
-```json
-{
- "tool": "execute_script",
- "parameters": {
- "script": "return arguments[0] + arguments[1];",
- "args": [10, 32]
- }
-}
-```
-
-### get_window_handles
-Returns a list of all window/tab handles in the current session.
-
-**Parameters:**
-None required
-
-**Example:**
-```json
-{
- "tool": "get_window_handles",
- "parameters": {}
-}
-```
-
-### switch_to_window
-Switches focus to a specific window or tab by its handle.
+### get_element_text
+Gets the text content of an element.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| handle | string | Yes | The window handle to switch to |
-
-**Example:**
-```json
-{
- "tool": "switch_to_window",
- "parameters": {
- "handle": "CDwindow-1234"
- }
-}
-```
-
-### switch_to_latest_window
-Switches focus to the most recently opened window or tab.
-
-**Parameters:**
-None required
-
-**Example:**
-```json
-{
- "tool": "switch_to_latest_window",
- "parameters": {}
-}
-```
-
-### close_current_window
-Closes the current window/tab and switches back to the previous one.
-
-**Parameters:**
-None required
+| by | string | Yes | Locator strategy |
+| value | string | Yes | Locator value |
+| timeout | number | No | Max wait in ms (default: 10000) |
-**Example:**
-```json
-{
- "tool": "close_current_window",
- "parameters": {}
-}
-```
-
-### switch_to_frame
-Switches focus to an iframe or frame within the page. Provide either `by`/`value` to locate the frame by element, or `index` to switch by position.
+### get_element_attribute
+Gets an attribute value from an element.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| by | string | No | Locator strategy (id, css, xpath, name, tag, class) |
-| value | string | No | Value for the locator strategy |
-| index | number | No | Frame index (0-based) |
-| timeout | number | No | Max wait time in ms (default: 10000) |
+| by | string | Yes | Locator strategy |
+| value | string | Yes | Locator value |
+| attribute | string | Yes | Attribute name (e.g., `href`, `value`, `class`) |
+| timeout | number | No | Max wait in ms (default: 10000) |
-**Example (by locator):**
-```json
-{
- "tool": "switch_to_frame",
- "parameters": {
- "by": "id",
- "value": "my-iframe"
- }
-}
-```
-
-**Example (by index):**
-```json
-{
- "tool": "switch_to_frame",
- "parameters": {
- "index": 0
- }
-}
-```
+### press_key
+Presses a keyboard key.
-### switch_to_default_content
-Switches focus back to the main page from an iframe.
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| key | string | Yes | Key to press (e.g., `Enter`, `Tab`, `a`) |
-**Parameters:**
-None required
+### upload_file
+Uploads a file via a file input element.
-**Example:**
-```json
-{
- "tool": "switch_to_default_content",
- "parameters": {}
-}
-```
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| by | string | Yes | Locator strategy |
+| value | string | Yes | Locator value |
+| filePath | string | Yes | Absolute path to the file |
+| timeout | number | No | Max wait in ms (default: 10000) |
-### accept_alert
-Accepts (clicks OK on) a browser alert, confirm, or prompt dialog.
+### take_screenshot
+Captures a screenshot of the current page.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| timeout | number | No | Max wait time in ms (default: 5000) |
+| outputPath | string | No | Save path. If omitted, returns base64 image data. |
-**Example:**
-```json
-{
- "tool": "accept_alert",
- "parameters": {}
-}
-```
+### close_session
+Closes the current browser session. No parameters.
-### dismiss_alert
-Dismisses (clicks Cancel on) a browser alert or confirm dialog.
+### execute_script
+Executes JavaScript in the browser. Use for advanced interactions not covered by other tools (e.g., drag and drop, scrolling, reading computed styles, DOM manipulation).
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| timeout | number | No | Max wait time in ms (default: 5000) |
-
-**Example:**
-```json
-{
- "tool": "dismiss_alert",
- "parameters": {}
-}
-```
+| script | string | Yes | JavaScript code to execute |
+| args | array | No | Arguments accessible via `arguments[0]`, etc. |
-### get_alert_text
-Gets the text content of a browser alert, confirm, or prompt dialog.
+### window
+Manages browser windows and tabs.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| timeout | number | No | Max wait time in ms (default: 5000) |
+| action | string | Yes | `list`, `switch`, `switch_latest`, or `close` |
+| handle | string | No | Window handle (required for `switch`) |
-**Example:**
-```json
-{
- "tool": "get_alert_text",
- "parameters": {}
-}
-```
-
-### send_alert_text
-Types text into a browser prompt dialog and accepts it.
+### frame
+Switches focus to a frame or back to the main page.
-**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| text | string | Yes | Text to type into the prompt |
-| timeout | number | No | Max wait time in ms (default: 5000) |
+| action | string | Yes | `switch` or `default` |
+| by | string | No | Locator strategy (for `switch`) |
+| value | string | No | Locator value (for `switch`) |
+| index | number | No | Frame index, 0-based (for `switch`) |
+| timeout | number | No | Max wait in ms (default: 10000) |
-**Example:**
-```json
-{
- "tool": "send_alert_text",
- "parameters": {
- "text": "my input"
- }
-}
-```
+### alert
+Handles browser alert, confirm, or prompt dialogs.
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| action | string | Yes | `accept`, `dismiss`, `get_text`, or `send_text` |
+| text | string | No | Text to send (required for `send_text`) |
+| timeout | number | No | Max wait in ms (default: 5000) |
### add_cookie
-Adds a cookie to the current browser session.
-
-**Parameters:**
+Adds a cookie. Browser must be on a page from the cookie's domain.
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| name | string | Yes | Cookie name |
| value | string | Yes | Cookie value |
| domain | string | No | Cookie domain |
-| path | string | No | Cookie path (default: /) |
-| secure | boolean | No | Whether the cookie is secure |
-| httpOnly | boolean | No | Whether the cookie is HTTP-only |
-| expiry | number | No | Cookie expiry as Unix timestamp |
-
-**Example:**
-```json
-{
- "tool": "add_cookie",
- "parameters": {
- "name": "session_id",
- "value": "abc123",
- "path": "/",
- "httpOnly": true
- }
-}
-```
+| path | string | No | Cookie path |
+| secure | boolean | No | Secure flag |
+| httpOnly | boolean | No | HTTP-only flag |
+| expiry | number | No | Unix timestamp |
### get_cookies
-Retrieves cookies from the current browser session. Returns all cookies or a specific cookie by name.
-
-**Parameters:**
+Gets cookies. Returns all cookies, or a single cookie when `name` is provided.
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| name | string | No | Cookie name to retrieve. If omitted, returns all cookies. |
-
-**Example — get all cookies:**
-```json
-{
- "tool": "get_cookies",
- "parameters": {}
-}
-```
-
-**Example — get a specific cookie:**
-```json
-{
- "tool": "get_cookies",
- "parameters": {
- "name": "session_id"
- }
-}
-```
+| name | string | No | Cookie name. Omit for all cookies. |
### delete_cookie
-Deletes cookies from the current browser session. Deletes a specific cookie by name, or all cookies if no name is provided.
+Deletes cookies. Deletes all cookies, or a single cookie when `name` is provided.
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| name | string | No | Cookie name. Omit to delete all. |
-**Parameters:**
+### diagnostics
+Gets browser diagnostics captured via WebDriver BiDi (auto-enabled when supported).
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
-| name | string | No | Cookie name to delete. If omitted, deletes all cookies. |
+| type | string | Yes | `console`, `errors`, or `network` |
+| clear | boolean | No | Clear buffer after returning (default: false) |
-**Example — delete a specific cookie:**
-```json
-{
- "tool": "delete_cookie",
- "parameters": {
- "name": "session_id"
- }
-}
-```
+
-**Example — delete all cookies:**
-```json
-{
- "tool": "delete_cookie",
- "parameters": {}
-}
-```
+---
-### get_console_logs
-Retrieves captured browser console messages (log, warn, error, etc.). Console logs are automatically captured in the background via WebDriver BiDi when the browser supports it — no configuration needed.
+
+## Development
-**Parameters:**
-| Parameter | Type | Required | Description |
-|-----------|------|----------|-------------|
-| clear | boolean | No | Clear the captured logs after retrieving them (default: false) |
+### Setup
-**Example:**
-```json
-{
- "tool": "get_console_logs",
- "parameters": {}
-}
+```bash
+git clone https://github.com/angiejones/mcp-selenium.git
+cd mcp-selenium
+npm install
```
-### get_page_errors
-Retrieves captured JavaScript errors and uncaught exceptions with full stack traces. Errors are automatically captured in the background via WebDriver BiDi.
+### Run Tests
-**Parameters:**
-| Parameter | Type | Required | Description |
-|-----------|------|----------|-------------|
-| clear | boolean | No | Clear the captured errors after retrieving them (default: false) |
-
-**Example:**
-```json
-{
- "tool": "get_page_errors",
- "parameters": {}
-}
+```bash
+npm test
```
-### get_network_logs
-Retrieves captured network activity including successful responses and failed requests. Network logs are automatically captured in the background via WebDriver BiDi.
+Requires Chrome + chromedriver on PATH. Tests run headless.
-**Parameters:**
-| Parameter | Type | Required | Description |
-|-----------|------|----------|-------------|
-| clear | boolean | No | Clear the captured logs after retrieving them (default: false) |
+### Install via Smithery
-**Example:**
-```json
-{
- "tool": "get_network_logs",
- "parameters": {}
-}
+```bash
+npx -y @smithery/cli install @angiejones/mcp-selenium --client claude
+```
+
+### Install globally
+
+```bash
+npm install -g @angiejones/mcp-selenium
+mcp-selenium
```
+
+
## License
MIT
diff --git a/package-lock.json b/package-lock.json
index 94034a8..ee3dc75 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
{
"name": "@angiejones/mcp-selenium",
- "version": "0.1.21",
+ "version": "0.2.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "@angiejones/mcp-selenium",
- "version": "0.1.21",
+ "version": "0.2.0",
"license": "ISC",
"dependencies": {
"@modelcontextprotocol/sdk": "^1.7.0",
@@ -345,6 +345,7 @@
"resolved": "https://registry.npmjs.org/express/-/express-5.0.1.tgz",
"integrity": "sha512-ORF7g6qGnD+YtUG9yx4DFoqCShNMmUKiXuT5oWMHiOvt/4WFbHC6yCwQMTSBMno7AqntNCAzzcnnjowRkTL9eQ==",
"license": "MIT",
+ "peer": true,
"dependencies": {
"accepts": "^2.0.0",
"body-parser": "^2.0.1",
@@ -1205,6 +1206,7 @@
"resolved": "https://registry.npmjs.org/zod/-/zod-3.24.2.tgz",
"integrity": "sha512-lY7CDW43ECgW9u1TcT3IoXHflywfVqDYze4waEz812jR/bZ8FHDsl7pFQoSZTz5N+2NqRXs8GBwnAwo3ZNxqhQ==",
"license": "MIT",
+ "peer": true,
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
diff --git a/package.json b/package.json
index d7bdbdf..19d5365 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "@angiejones/mcp-selenium",
- "version": "0.1.21",
+ "version": "0.2.0",
"description": "Selenium WebDriver MCP Server",
"type": "module",
"main": "src/lib/server.js",
diff --git a/src/lib/server.js b/src/lib/server.js
index 4f3c04c..8ff5b11 100755
--- a/src/lib/server.js
+++ b/src/lib/server.js
@@ -10,6 +10,16 @@ import { Options as FirefoxOptions } from 'selenium-webdriver/firefox.js';
import { Options as EdgeOptions } from 'selenium-webdriver/edge.js';
import { Options as SafariOptions } from 'selenium-webdriver/safari.js';
+// Create an MCP server, reporting the version declared in package.json
+import { createRequire } from 'module';
+const require = createRequire(import.meta.url);
+const { version } = require('../../package.json');
+
+const server = new McpServer({
+ name: "MCP Selenium",
+ version
+});
+
// BiDi imports — loaded dynamically to avoid hard failures if not available
let LogInspector, Network;
try {
@@ -22,13 +32,6 @@ try {
Network = null;
}
-
-// Create an MCP server
-const server = new McpServer({
- name: "MCP Selenium",
- version: "1.0.0"
-});
-
// Server state
const state = {
drivers: new Map(),
@@ -108,29 +111,6 @@ async function setupBidi(driver, sessionId) {
state.bidi.set(sessionId, bidi);
}
-function registerBidiTool(name, description, logKey, emptyMessage, unavailableMessage) {
- server.tool(
- name,
- description,
- { clear: z.boolean().optional().describe("Clear after returning (default: false)") },
- async ({ clear = false }) => {
- try {
- getDriver();
- const bidi = state.bidi.get(state.currentSession);
- if (!bidi?.available) {
- return { content: [{ type: 'text', text: unavailableMessage }] };
- }
- const logs = bidi[logKey];
- const result = logs.length === 0 ? emptyMessage : JSON.stringify(logs, null, 2);
- if (clear) bidi[logKey] = [];
- return { content: [{ type: 'text', text: result }] };
- } catch (e) {
- return { content: [{ type: 'text', text: `Error: ${e.message}` }], isError: true };
- }
- }
- );
-}
-
// Common schemas
const browserOptionsSchema = z.object({
headless: z.boolean().optional().describe("Run browser in headless mode"),
@@ -160,7 +140,7 @@ server.tool(
// Enable BiDi websocket if the modules are available
if (LogInspector && Network) {
// 'ignore' prevents BiDi from auto-dismissing alert/confirm/prompt dialogs,
- // allowing accept_alert, dismiss_alert, and get_alert_text to work as expected.
+ // allowing the alert tool's accept, dismiss, and get_text actions to work as expected.
builder = builder.withCapabilities({ 'webSocketUrl': true, 'unhandledPromptBehavior': 'ignore' });
}
@@ -282,46 +262,43 @@ server.tool(
// Element Interaction Tools
server.tool(
- "find_element",
- "finds an element",
+ "interact",
+ "performs a mouse action on an element",
{
+ action: z.enum(["click", "doubleclick", "rightclick", "hover"]).describe("Mouse action to perform"),
...locatorSchema
},
- async ({ by, value, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const locator = getLocator(by, value);
- await driver.wait(until.elementLocated(locator), timeout);
- return {
- content: [{ type: 'text', text: 'Element found' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error finding element: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "click_element",
- "clicks an element",
- {
- ...locatorSchema
- },
- async ({ by, value, timeout = 10000 }) => {
+ async ({ action, by, value, timeout = 10000 }) => {
try {
const driver = getDriver();
const locator = getLocator(by, value);
const element = await driver.wait(until.elementLocated(locator), timeout);
- await element.click();
- return {
- content: [{ type: 'text', text: 'Element clicked' }]
- };
+
+ switch (action) {
+ case 'click':
+ await element.click();
+ return { content: [{ type: 'text', text: 'Element clicked' }] };
+ case 'doubleclick': {
+ const dblActions = driver.actions({ bridge: true });
+ await dblActions.doubleClick(element).perform();
+ return { content: [{ type: 'text', text: 'Double click performed' }] };
+ }
+ case 'rightclick': {
+ const ctxActions = driver.actions({ bridge: true });
+ await ctxActions.contextClick(element).perform();
+ return { content: [{ type: 'text', text: 'Right click performed' }] };
+ }
+ case 'hover': {
+ const hoverActions = driver.actions({ bridge: true });
+ await hoverActions.move({ origin: element }).perform();
+ return { content: [{ type: 'text', text: 'Hovered over element' }] };
+ }
+ default:
+ return { content: [{ type: 'text', text: `Unknown action: ${action}` }], isError: true };
+ }
} catch (e) {
return {
- content: [{ type: 'text', text: `Error clicking element: ${e.message}` }],
+ content: [{ type: 'text', text: `Error performing ${action}: ${e.message}` }],
isError: true
};
}
@@ -330,7 +307,7 @@ server.tool(
server.tool(
"send_keys",
- "sends keys to an element, aka typing",
+ "sends keys to an element, aka typing. Clears the field first.",
{
...locatorSchema,
text: z.string().describe("Text to enter into the element")
@@ -356,7 +333,7 @@ server.tool(
server.tool(
"get_element_text",
- "gets the text() of an element",
+ "gets the text content of an element",
{
...locatorSchema
},
@@ -378,110 +355,6 @@ server.tool(
}
);
-server.tool(
- "hover",
- "moves the mouse to hover over an element",
- {
- ...locatorSchema
- },
- async ({ by, value, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const locator = getLocator(by, value);
- const element = await driver.wait(until.elementLocated(locator), timeout);
- const actions = driver.actions({ bridge: true });
- await actions.move({ origin: element }).perform();
- return {
- content: [{ type: 'text', text: 'Hovered over element' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error hovering over element: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "drag_and_drop",
- "drags an element and drops it onto another element",
- {
- ...locatorSchema,
- targetBy: z.enum(["id", "css", "xpath", "name", "tag", "class"]).describe("Locator strategy to find target element"),
- targetValue: z.string().describe("Value for the target locator strategy")
- },
- async ({ by, value, targetBy, targetValue, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const sourceLocator = getLocator(by, value);
- const targetLocator = getLocator(targetBy, targetValue);
- const sourceElement = await driver.wait(until.elementLocated(sourceLocator), timeout);
- const targetElement = await driver.wait(until.elementLocated(targetLocator), timeout);
- const actions = driver.actions({ bridge: true });
- await actions.dragAndDrop(sourceElement, targetElement).perform();
- return {
- content: [{ type: 'text', text: 'Drag and drop completed' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error performing drag and drop: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "double_click",
- "performs a double click on an element",
- {
- ...locatorSchema
- },
- async ({ by, value, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const locator = getLocator(by, value);
- const element = await driver.wait(until.elementLocated(locator), timeout);
- const actions = driver.actions({ bridge: true });
- await actions.doubleClick(element).perform();
- return {
- content: [{ type: 'text', text: 'Double click performed' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error performing double click: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "right_click",
- "performs a right click (context click) on an element",
- {
- ...locatorSchema
- },
- async ({ by, value, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const locator = getLocator(by, value);
- const element = await driver.wait(until.elementLocated(locator), timeout);
- const actions = driver.actions({ bridge: true });
- await actions.contextClick(element).perform();
- return {
- content: [{ type: 'text', text: 'Right click performed' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error performing right click: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
server.tool(
"press_key",
"simulates pressing a keyboard key",
@@ -491,8 +364,6 @@ server.tool(
async ({ key }) => {
try {
const driver = getDriver();
- // Map named keys to Selenium Key constants (case-insensitive).
- // Single characters are passed through as-is.
const resolvedKey = key.length === 1
? key
: Key[key.toUpperCase().replace(/ /g, '_')] ?? null;
@@ -601,30 +472,6 @@ server.tool(
);
// Element Utility Tools
-server.tool(
- "clear_element",
- "clears the content of an input or textarea element",
- {
- ...locatorSchema
- },
- async ({ by, value, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const locator = getLocator(by, value);
- const element = await driver.wait(until.elementLocated(locator), timeout);
- await element.clear();
- return {
- content: [{ type: 'text', text: 'Element cleared' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error clearing element: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
server.tool(
"get_element_attribute",
"gets the value of an attribute on an element",
@@ -650,33 +497,9 @@ server.tool(
}
);
-server.tool(
- "scroll_to_element",
- "scrolls the page until an element is visible",
- {
- ...locatorSchema
- },
- async ({ by, value, timeout = 10000 }) => {
- try {
- const driver = getDriver();
- const locator = getLocator(by, value);
- const element = await driver.wait(until.elementLocated(locator), timeout);
- await driver.executeScript("arguments[0].scrollIntoView({block: 'center'});", element);
- return {
- content: [{ type: 'text', text: 'Scrolled to element' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error scrolling to element: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
server.tool(
"execute_script",
- "executes JavaScript in the browser and returns the result",
+ "executes JavaScript in the browser and returns the result. Use for advanced interactions not covered by other tools (e.g., drag and drop, scrolling, reading computed styles, manipulating the DOM directly).",
{
script: z.string().describe("JavaScript code to execute in the browser"),
args: z.array(z.any()).optional().describe("Optional arguments to pass to the script (accessible via arguments[0], arguments[1], etc.)")
@@ -700,125 +523,81 @@ server.tool(
}
);
-// Window/Tab Management Tools
+// Window/Tab Management
server.tool(
- "switch_to_window",
- "switches to a specific browser window or tab by handle",
+ "window",
+ "manages browser windows and tabs",
{
- handle: z.string().describe("Window handle to switch to")
+ action: z.enum(["list", "switch", "switch_latest", "close"]).describe("Window action to perform"),
+ handle: z.string().optional().describe("Window handle (required for switch)")
},
- async ({ handle }) => {
+ async ({ action, handle }) => {
try {
const driver = getDriver();
- await driver.switchTo().window(handle);
- return {
- content: [{ type: 'text', text: `Switched to window: ${handle}` }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error switching window: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "get_window_handles",
- "returns all window/tab handles for the current session",
- {},
- async () => {
- try {
- const driver = getDriver();
- const handles = await driver.getAllWindowHandles();
- const current = await driver.getWindowHandle();
- return {
- content: [{ type: 'text', text: JSON.stringify({ current, all: handles }, null, 2) }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error getting window handles: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "switch_to_latest_window",
- "switches to the most recently opened window or tab",
- {},
- async () => {
- try {
- const driver = getDriver();
- const handles = await driver.getAllWindowHandles();
- if (handles.length === 0) {
- throw new Error('No windows available');
- }
- const latest = handles[handles.length - 1];
- await driver.switchTo().window(latest);
- return {
- content: [{ type: 'text', text: `Switched to latest window: ${latest}` }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error switching to latest window: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "close_current_window",
- "closes the current window/tab and switches back to the first remaining window",
- {},
- async () => {
- try {
- const driver = getDriver();
- await driver.close();
- const handles = await driver.getAllWindowHandles();
- if (handles.length > 0) {
- await driver.switchTo().window(handles[0]);
- return {
- content: [{ type: 'text', text: `Window closed. Switched to: ${handles[0]}` }]
- };
- }
- // Last window closed — quit the driver and clean up the session
- const sessionId = state.currentSession;
- try {
- await driver.quit();
- } catch (quitError) {
- console.error(`Error quitting driver for session ${sessionId}:`, quitError);
+ switch (action) {
+ case 'list': {
+ const handles = await driver.getAllWindowHandles();
+ const current = await driver.getWindowHandle();
+ return { content: [{ type: 'text', text: JSON.stringify({ current, all: handles }, null, 2) }] };
+ }
+ case 'switch': {
+ if (!handle) throw new Error('handle is required for switch action');
+ await driver.switchTo().window(handle);
+ return { content: [{ type: 'text', text: `Switched to window: ${handle}` }] };
+ }
+ case 'switch_latest': {
+ const handles = await driver.getAllWindowHandles();
+ if (handles.length === 0) throw new Error('No windows available');
+ const latest = handles[handles.length - 1];
+ await driver.switchTo().window(latest);
+ return { content: [{ type: 'text', text: `Switched to latest window: ${latest}` }] };
+ }
+ case 'close': {
+ await driver.close();
+ let handles = [];
+ try { handles = await driver.getAllWindowHandles(); } catch (_) { /* session gone */ }
+ if (handles.length > 0) {
+ await driver.switchTo().window(handles[0]);
+ return { content: [{ type: 'text', text: `Window closed. Switched to: ${handles[0]}` }] };
+ }
+ const sessionId = state.currentSession;
+ try { await driver.quit(); } catch (_) { /* ignore */ }
+ state.drivers.delete(sessionId);
+ state.bidi.delete(sessionId);
+ state.currentSession = null;
+ return { content: [{ type: 'text', text: 'Last window closed. Session ended.' }] };
+ }
+ default:
+ return { content: [{ type: 'text', text: `Unknown action: ${action}` }], isError: true };
}
- state.drivers.delete(sessionId);
- state.bidi.delete(sessionId);
- state.currentSession = null;
- return {
- content: [{ type: 'text', text: 'Last window closed. Session ended.' }]
- };
} catch (e) {
return {
- content: [{ type: 'text', text: `Error closing window: ${e.message}` }],
+ content: [{ type: 'text', text: `Error in window ${action}: ${e.message}` }],
isError: true
};
}
}
);
-// Frame Management Tools
+// Frame Management
server.tool(
- "switch_to_frame",
- "switches focus to an iframe or frame within the page. Provide either by/value to locate by element, or index to switch by position.",
+ "frame",
+ "switches focus to a frame or back to the main page",
{
- by: z.enum(["id", "css", "xpath", "name", "tag", "class"]).optional().describe("Locator strategy to find frame element"),
+ action: z.enum(["switch", "default"]).describe("Frame action to perform"),
+ by: z.enum(["id", "css", "xpath", "name", "tag", "class"]).optional().describe("Locator strategy for frame element"),
value: z.string().optional().describe("Value for the locator strategy"),
- index: z.number().optional().describe("Frame index (0-based) to switch to by position"),
- timeout: z.number().optional().describe("Maximum time to wait for frame in milliseconds")
+ index: z.number().optional().describe("Frame index (0-based)"),
+ timeout: z.number().optional().describe("Max wait in ms")
},
- async ({ by, value, index, timeout = 10000 }) => {
+ async ({ action, by, value, index, timeout = 10000 }) => {
try {
const driver = getDriver();
+ if (action === 'default') {
+ await driver.switchTo().defaultContent();
+ return { content: [{ type: 'text', text: 'Switched to default content' }] };
+ }
+ // action === 'switch'
if (index !== undefined) {
await driver.switchTo().frame(index);
} else if (by && value) {
@@ -826,133 +605,55 @@ server.tool(
const element = await driver.wait(until.elementLocated(locator), timeout);
await driver.switchTo().frame(element);
} else {
- throw new Error('Provide either by/value to locate frame by element, or index to switch by position');
+ throw new Error('Provide either by/value to locate frame, or index to switch by position');
}
- return {
- content: [{ type: 'text', text: `Switched to frame` }]
- };
+ return { content: [{ type: 'text', text: 'Switched to frame' }] };
} catch (e) {
return {
- content: [{ type: 'text', text: `Error switching to frame: ${e.message}` }],
+ content: [{ type: 'text', text: `Error in frame ${action}: ${e.message}` }],
isError: true
};
}
}
);
+// Alert/Dialog Handling
server.tool(
- "switch_to_default_content",
- "switches focus back to the main page from an iframe",
- {},
- async () => {
- try {
- const driver = getDriver();
- await driver.switchTo().defaultContent();
- return {
- content: [{ type: 'text', text: 'Switched to default content' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error switching to default content: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-// Alert/Dialog Tools
-server.tool(
- "accept_alert",
- "accepts (clicks OK) on a browser alert, confirm, or prompt dialog",
- {
- timeout: z.number().optional().describe("Maximum time to wait for alert in milliseconds")
- },
- async ({ timeout = 5000 }) => {
- try {
- const driver = getDriver();
- await driver.wait(until.alertIsPresent(), timeout);
- const alert = await driver.switchTo().alert();
- await alert.accept();
- return {
- content: [{ type: 'text', text: 'Alert accepted' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error accepting alert: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "dismiss_alert",
- "dismisses (clicks Cancel) on a browser alert, confirm, or prompt dialog",
- {
- timeout: z.number().optional().describe("Maximum time to wait for alert in milliseconds")
- },
- async ({ timeout = 5000 }) => {
- try {
- const driver = getDriver();
- await driver.wait(until.alertIsPresent(), timeout);
- const alert = await driver.switchTo().alert();
- await alert.dismiss();
- return {
- content: [{ type: 'text', text: 'Alert dismissed' }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error dismissing alert: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "get_alert_text",
- "gets the text content of a browser alert, confirm, or prompt dialog",
+ "alert",
+ "handles a browser alert, confirm, or prompt dialog",
{
- timeout: z.number().optional().describe("Maximum time to wait for alert in milliseconds")
+ action: z.enum(["accept", "dismiss", "get_text", "send_text"]).describe("Action to perform on the alert"),
+ text: z.string().optional().describe("Text to send (required for send_text)"),
+ timeout: z.number().optional().describe("Max wait in ms")
},
- async ({ timeout = 5000 }) => {
+ async ({ action, text, timeout = 5000 }) => {
try {
const driver = getDriver();
await driver.wait(until.alertIsPresent(), timeout);
- const alert = await driver.switchTo().alert();
- const text = await alert.getText();
- return {
- content: [{ type: 'text', text }]
- };
- } catch (e) {
- return {
- content: [{ type: 'text', text: `Error getting alert text: ${e.message}` }],
- isError: true
- };
- }
- }
-);
-
-server.tool(
- "send_alert_text",
- "types text into a browser prompt dialog and accepts it",
- {
- text: z.string().describe("Text to enter into the prompt"),
- timeout: z.number().optional().describe("Maximum time to wait for alert in milliseconds")
- },
- async ({ text, timeout = 5000 }) => {
- try {
- const driver = getDriver();
- await driver.wait(until.alertIsPresent(), timeout);
- const alert = await driver.switchTo().alert();
- await alert.sendKeys(text);
- await alert.accept();
- return {
- content: [{ type: 'text', text: `Text "${text}" sent to prompt and accepted` }]
- };
+ const alertObj = await driver.switchTo().alert();
+ switch (action) {
+ case 'accept':
+ await alertObj.accept();
+ return { content: [{ type: 'text', text: 'Alert accepted' }] };
+ case 'dismiss':
+ await alertObj.dismiss();
+ return { content: [{ type: 'text', text: 'Alert dismissed' }] };
+ case 'get_text': {
+ const alertText = await alertObj.getText();
+ return { content: [{ type: 'text', text: alertText }] };
+ }
+ case 'send_text': {
+ if (text === undefined) throw new Error('text is required for send_text action');
+ await alertObj.sendKeys(text);
+ await alertObj.accept();
+ return { content: [{ type: 'text', text: `Text "${text}" sent to prompt and accepted` }] };
+ }
+ default:
+ return { content: [{ type: 'text', text: `Unknown action: ${action}` }], isError: true };
+ }
} catch (e) {
return {
- content: [{ type: 'text', text: `Error sending text to alert: ${e.message}` }],
+ content: [{ type: 'text', text: `Error in alert ${action}: ${e.message}` }],
isError: true
};
}
@@ -1070,28 +771,38 @@ server.tool(
);
// BiDi Diagnostic Tools
-registerBidiTool(
- 'get_console_logs',
- 'returns browser console messages (log, warn, info, debug) captured via WebDriver BiDi. Useful for debugging page behavior, seeing application output, and catching warnings.',
- 'consoleLogs',
- 'No console logs captured',
- 'Console log capture is not available (BiDi not supported by this browser/driver)'
-);
-
-registerBidiTool(
- 'get_page_errors',
- 'returns JavaScript errors and exceptions captured via WebDriver BiDi. Includes stack traces when available. Essential for diagnosing why a page is broken or a feature isn\'t working.',
- 'pageErrors',
- 'No page errors captured',
- 'Page error capture is not available (BiDi not supported by this browser/driver)'
-);
+const diagnosticTypes = {
+ console: { logKey: 'consoleLogs', emptyMessage: 'No console logs captured' },
+ errors: { logKey: 'pageErrors', emptyMessage: 'No page errors captured' },
+ network: { logKey: 'networkLogs', emptyMessage: 'No network activity captured' }
+};
-registerBidiTool(
- 'get_network_logs',
- 'returns network activity (completed responses and failed requests) captured via WebDriver BiDi. Shows HTTP status codes, URLs, methods, and error details. Useful for diagnosing failed API calls and broken resources.',
- 'networkLogs',
- 'No network activity captured',
- 'Network log capture is not available (BiDi not supported by this browser/driver)'
+server.tool(
+ "diagnostics",
+ "retrieves browser diagnostics (console logs, JS errors, or network activity) captured via WebDriver BiDi",
+ {
+ type: z.enum(["console", "errors", "network"]).describe("Type of diagnostic data to retrieve"),
+ clear: z.boolean().optional().describe("Clear after returning (default: false)")
+ },
+ async ({ type, clear = false }) => {
+ try {
+ getDriver();
+ const bidi = state.bidi.get(state.currentSession);
+ if (!bidi?.available) {
+ return { content: [{ type: 'text', text: 'Diagnostics not available (BiDi not supported by this browser/driver)' }] };
+ }
+ const { logKey, emptyMessage } = diagnosticTypes[type];
+ const logs = bidi[logKey];
+ const result = logs.length === 0 ? emptyMessage : JSON.stringify(logs, null, 2);
+ if (clear) bidi[logKey] = [];
+ return { content: [{ type: 'text', text: result }] };
+ } catch (e) {
+ return {
+ content: [{ type: 'text', text: `Error getting diagnostics: ${e.message}` }],
+ isError: true
+ };
+ }
+ }
);
// Resources
diff --git a/test/bidi.test.mjs b/test/bidi.test.mjs
index 778f1dd..f071480 100644
--- a/test/bidi.test.mjs
+++ b/test/bidi.test.mjs
@@ -43,14 +43,14 @@ describe('BiDi Diagnostic Tools', () => {
});
it('should capture console messages at different levels', async () => {
- await client.callTool('get_console_logs', { clear: true });
+ await client.callTool('diagnostics', { type: 'console', clear: true });
- await client.callTool('click_element', { by: 'id', value: 'log-info' });
- await client.callTool('click_element', { by: 'id', value: 'log-warn' });
- await client.callTool('click_element', { by: 'id', value: 'log-error' });
+ await client.callTool('interact', { action: 'click', by: 'id', value: 'log-info' });
+ await client.callTool('interact', { action: 'click', by: 'id', value: 'log-warn' });
+ await client.callTool('interact', { action: 'click', by: 'id', value: 'log-error' });
await new Promise(r => setTimeout(r, 500));
- const result = await client.callTool('get_console_logs', {});
+ const result = await client.callTool('diagnostics', { type: 'console' });
assert.ok(!result.isError, `Tool returned error: ${getResponseText(result)}`);
const logs = JSON.parse(getResponseText(result));
@@ -67,10 +67,10 @@ describe('BiDi Diagnostic Tools', () => {
await client.callTool('execute_script', { script: 'console.log("clear-test");' });
await new Promise(r => setTimeout(r, 500));
- const clearResult = await client.callTool('get_console_logs', { clear: true });
+ const clearResult = await client.callTool('diagnostics', { type: 'console', clear: true });
assert.ok(getResponseText(clearResult).includes('clear-test'), 'Should return logs before clearing');
- const afterResult = await client.callTool('get_console_logs', {});
+ const afterResult = await client.callTool('diagnostics', { type: 'console' });
assert.strictEqual(getResponseText(afterResult), 'No console logs captured');
});
});
@@ -89,12 +89,12 @@ describe('BiDi Diagnostic Tools', () => {
});
it('should capture JavaScript errors with stack traces', async () => {
- await client.callTool('get_page_errors', { clear: true });
+ await client.callTool('diagnostics', { type: 'errors', clear: true });
await client.callTool('execute_script', {
script: 'setTimeout(() => { throw new Error("Intentional test error"); }, 0);'
});
await new Promise(r => setTimeout(r, 1000));
- const result = await client.callTool('get_page_errors', {});
+ const result = await client.callTool('diagnostics', { type: 'errors' });
assert.ok(!result.isError, `Tool returned error: ${getResponseText(result)}`);
const text = getResponseText(result);
const errors = JSON.parse(text);
@@ -118,14 +118,14 @@ describe('BiDi Diagnostic Tools', () => {
});
it('should capture successful and failed network requests', async () => {
- await client.callTool('get_network_logs', { clear: true });
+ await client.callTool('diagnostics', { type: 'network', clear: true });
await client.callTool('navigate', { url: fixture('bidi.html') });
await client.callTool('execute_script', {
script: 'fetch("http://localhost:1/nonexistent").catch(() => {});'
});
await new Promise(r => setTimeout(r, 1000));
- const result = await client.callTool('get_network_logs', {});
+ const result = await client.callTool('diagnostics', { type: 'network' });
assert.ok(!result.isError, `Tool returned error: ${getResponseText(result)}`);
const logs = JSON.parse(getResponseText(result));
@@ -152,7 +152,7 @@ describe('BiDi Diagnostic Tools', () => {
await client.callTool('execute_script', { script: 'console.log("session-1-log");' });
await new Promise(r => setTimeout(r, 500));
- const firstLogs = await client.callTool('get_console_logs', {});
+ const firstLogs = await client.callTool('diagnostics', { type: 'console' });
assert.ok(getResponseText(firstLogs).includes('session-1-log'));
await client.callTool('close_session', {});
@@ -161,7 +161,7 @@ describe('BiDi Diagnostic Tools', () => {
options: { headless: true, arguments: ['--no-sandbox'] }
});
- const newLogs = await client.callTool('get_console_logs', {});
+ const newLogs = await client.callTool('diagnostics', { type: 'console' });
assert.strictEqual(getResponseText(newLogs), 'No console logs captured');
await client.callTool('close_session', {});
diff --git a/test/fixtures/alerts.html b/test/fixtures/alerts.html
index 0926fc9..3f5360b 100644
--- a/test/fixtures/alerts.html
+++ b/test/fixtures/alerts.html
@@ -3,9 +3,10 @@
Alert Test Page
Alert Tests
-
+
+
diff --git a/test/fixtures/drag-drop.html b/test/fixtures/drag-drop.html
index 65c5487..09eb2b2 100644
--- a/test/fixtures/drag-drop.html
+++ b/test/fixtures/drag-drop.html
@@ -17,5 +17,28 @@
Drag me
Drop here
+
+
+