browserbase · shubh24 · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ This plugin includes the following skills (see `skills/` for details):
 | [company-research](skills/company-research/SKILL.md) | Discover target companies matching your ICP using the Browserbase Search API, deep-research each one, and score fit into a research report and CSV |
 | [event-prospecting](skills/event-prospecting/SKILL.md) | Extract speakers from a conference page, filter their companies against your ICP, and deep-research the best-fit people into a person-first prospecting report |
 | [competitor-analysis](skills/competitor-analysis/SKILL.md) | Auto-discover a company's competitors via the Browserbase Search API, deep-research each across marketing, signal, benchmark, and strategic-diff lanes, and compile a browsable HTML report with an overview, per-competitor deep dives, a feature/pricing matrix, and a mentions feed |
+| [browser-record](skills/browser-record/SKILL.md) | Record a human browser flow (clicks, typing, screenshots, full CDP trace) on a Browserbase session, then let an agent distill what the human *meant* — collapsing corrections, dropping abandoned actions — into a reusable, parameterized task skill that replays against the live page |
 
 ## Installation
 

diff --git a/skills/browser-record/LICENSE.txt b/skills/browser-record/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Browserbase, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/skills/browser-record/SKILL.md b/skills/browser-record/SKILL.md
@@ -0,0 +1,130 @@
+---
+name: browser-record
+description: Record a human browser flow on a Browserbase session and distill it into a reusable, parameterized task skill. Captures clicks/typing/screenshots (plus an optional full CDP trace), then an agent reasons about what the human *meant* — collapsing corrections, dropping abandoned actions — and writes an intent-level SKILL.md that replays against the live page. Use for "show, don't prompt": record a flow once and turn it into a skill. Triggers on "record this flow", "turn this into a skill", "record a browser workflow", "browser record".
+compatibility: "Requires Node 18+ and the browse CLI (`npm install -g browse`), plus `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`. Record uses `@browserbasehq/sdk` + `playwright-core` — run `npm install` in this skill dir. Pairs with the `browser-trace` skill for the full CDP firehose."
+license: MIT
+allowed-tools: Bash, Read, Grep
+---
+
+# Browser Record
+
+"Show the bug instead of prompting it." Record a human flow once, then turn it
+into a **reusable, parameterized task skill** an agent can replay against the live
+page.
+
+The pipeline is **capture wide, reason narrow**:
+
+```
+record (interaction stream + screenshots)              ← semantic spine
+  + browser-trace (CDP firehose: network/console/DOM)  ← full observability
+  → distill = teacher agent reasons about INTENT       ← collapses corrections,
+  → skills/<task>/SKILL.md                                drops abandoned actions
+```
+
+The key idea: a recording is **mechanics** ("typed 'new yo', clicked `#c307`").
+What you want is **intent** ("destination = New York"). Recovering intent —
+including spotting that the user typed San Francisco, erased it, and chose Los
+Angeles, or applied a filter then removed it — is a judgment, so the distiller is
+**an agent, not a script** (see `references/distill.md`).
+
+## 1. Capture
+
+Record produces the **semantic spine**: each click/type with the acted element's
+accessible `name` + `role` + committed value, plus a screenshot per step.
+
+```bash
+RR_URL="https://www.saucedemo.com" RR_OUT=/tmp/rec.json RR_TITLE="login flow" \
+  node --env-file=.env scripts/record.mjs
+```
+
+Open the printed **live view URL**, perform the flow, then stop with ENTER,
+`touch /tmp/rr-stop`, or `RR_SECONDS=30`. Output: `RR_OUT` plus a sibling screenshots dir (`/tmp/rec.json` → `/tmp/rec-shots/`).
+
+**For full observability**, attach `browser-trace` so the teacher agent can also
+query network/console/DOM. Create one keep-alive session, point both at it:
+
+```bash
+node ../browser-trace/scripts/bb-capture.mjs --new myflow   # session + CDP firehose
+SID=$(jq -r .browserbase.session_id .o11y/myflow/manifest.json)
+CONNECT_URL=$(browse cloud sessions get "$SID" | jq -r .connectUrl)
+RR_CONNECT_URL="$CONNECT_URL" RR_URL="https://site.com" RR_OUT=/tmp/rec.json \
+  node --env-file=.env scripts/record.mjs                   # attaches to same session
+# after stopping the recording:
+node ../browser-trace/scripts/stop-capture.mjs myflow && node ../browser-trace/scripts/bisect-cdp.mjs myflow
+```
+
+| Var | Default | Meaning |
+|-----|---------|---------|
+| `RR_URL` | `https://example.com` | start URL |
+| `RR_OUT` | `/tmp/recording-<ts>.json` | output recording path |
+| `RR_CONNECT_URL` | _(none)_ | attach to an existing session (e.g. browser-trace's) instead of creating one |
+| `RR_TITLE` / `RR_STOP` / `RR_SECONDS` | — | recording title / stop-file path (touch it to stop, e.g. `/tmp/rr-stop`) / auto-stop after N seconds |
+
+## 2. Distill (the agent does this)
+
+Read `references/distill.md`, then **act as the teacher agent**: read
+`recording.json` + the screenshots, query the `browser-trace` buckets as needed,
+and reconstruct the *smallest set of intents that explains the session* —
+collapsing corrections, dropping abandoned/undone actions, parameterizing the
+values the user supplied. Write the result as `skills/<task>/`.
+
+Each step's headline is the value the field **committed to** (the acted element's
+`name`), never the keystrokes or a dynamic selector. The committed value is also
+the step's verification check.
+
+### What the generated task skill must contain
+
+- `SKILL.md` — intent steps (shape below).
+- `screenshots/NN-<label>.png` — the committed-state shot for each intent step,
+  curated from the recording and referenced per step. This is the visual oracle.
+- `recording.json` — the raw mechanics, last-resort fallback only.
+
+### Task skill shape
+
+Each step states the **intent**, names the **recorded target** (the element's
+accessible name/role, and its selector if useful) as a *hint*, explicitly grants
+the agent **agency to use whatever element achieves the intent**, points at the
+screenshot, and gives a verification check.
+
+```markdown
+---
+name: <task>
+description: <what it does + when to fire, with triggers>
+license: MIT
+---
+# <Task>
+Realize each intent against the live UI — do NOT replay keystrokes or dynamic
+selectors. The "recorded target" is a hint; if the live page differs, use any
+element that achieves the intent. Verify each step.
+
+Inputs: origin, destination, depart
+
+1. Set destination = {destination}.
+   Recorded target: combobox "Where to?" (aria/Where to?) → suggestion option.
+   See screenshots/03-destination.png · ✅ field reads {destination}, not "Anywhere".
+...
+Fallback: screenshots/ (oracle) · recording.json (raw mechanics, last resort)
+```
+
+## 3. Replay
+
+Replay = **invoke the generated task skill** like any skill (a natural-language
+request that matches its triggers). The agent realizes each intent via `browse`,
+using the per-step screenshots as the oracle and verifying committed values.
+Because it replays *intent*, it survives dynamic-id churn and minor layout changes.
+
+## Recording shape
+
+```json
+{
+  "title": "login flow",
+  "startUrl": "https://www.saucedemo.com",
+  "shots": "/tmp/rec-shots",
+  "steps": [
+    { "type": "navigate", "url": "https://www.saucedemo.com" },
+    { "type": "change", "name": "Username", "role": "textbox", "value": "standard_user",
+      "selectors": [["aria/Username"], ["#user-name"]], "screenshot": "/tmp/rec-shots/step-02.png" },
+    { "type": "click", "name": "Login", "role": "button", "selectors": [["text/Login"], ["#login-button"]] }
+  ]
+}
+```
diff --git a/skills/browser-record/evals/evals.json b/skills/browser-record/evals/evals.json
@@ -0,0 +1,17 @@
+{
+  "skill": "browser-record",
+  "evals": [
+    {
+      "prompt": "Record my flight search on Google Flights and turn it into a reusable skill.",
+      "expected": "Runs scripts/record.mjs (ideally attached to a browser-trace session via RR_CONNECT_URL), surfaces the live-view URL for the user to perform the flow, and saves the interaction stream + per-step screenshots."
+    },
+    {
+      "prompt": "Now distill that recording into a task skill.",
+      "expected": "Acts as the teacher agent per references/distill.md: reads recording.json + screenshots, queries the CDP trace as needed, recovers intent (committed values, not keystrokes), and writes skills/<task>/ with a parameterized SKILL.md, a curated screenshots/ folder referenced per step, and recording.json as fallback."
+    },
+    {
+      "prompt": "I typed San Francisco, deleted it, then typed Los Angeles — make sure the skill only does Los Angeles.",
+      "expected": "Explains that the teacher-agent distiller collapses self-corrections and drops abandoned actions, so the emitted skill sets origin = Los Angeles only; the intermediate San Francisco state is omitted as noise."
+    }
+  ]
+}
diff --git a/skills/browser-record/package.json b/skills/browser-record/package.json
@@ -0,0 +1,10 @@
+{
+  "name": "browser-record",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "dependencies": {
+    "@browserbasehq/sdk": "^2.7.0",
+    "playwright-core": "^1.49.0"
+  }
+}
diff --git a/skills/browser-record/references/distill.md b/skills/browser-record/references/distill.md
@@ -0,0 +1,62 @@
+# Distill: the teacher agent
+
+The distiller is **an agent, not a script.** Reconstructing what a human *meant*
+from what they *did* is a goal-level judgment — collapsing self-corrections,
+dropping abandoned actions, parameterizing variables — and no deterministic rule
+can do it. This is the same shape as the `autobrowse` teacher loop: there the
+outer agent reads its own run's trace and improves a skill; here it reads a
+*human's* trace and authors one.
+
+## Inputs (capture wide, read selectively)
+
+Give the agent everything the session produced, but let it **query** the trace
+rather than dumping the firehose into context (that's what the bisected buckets
+are for — progressive disclosure):
+
+| source | what it carries | how to read it |
+|--------|-----------------|----------------|
+| `recording.json` | semantic spine: each click/type with the acted element's `name` + `role` + value | read in full (it's small) |
+| `<recording>-shots/step-NN.png` | what the page looked like at each commit | read the ones you need to disambiguate intent |
+| `browser-trace` buckets (`.o11y/<run>/cdp/by-bucket/`) | network, console, DOM dumps, exact event timing | `grep`/`jq`/`query.mjs` on demand — e.g. to confirm a click triggered a request, or that a value committed |
+
+## The job
+
+Produce the **smallest set of intents that explains the session**, then write a
+parameterized task skill. Specifically:
+
+1. **Recover intent, not mechanics.** A step's headline is the value the field
+   *committed to* — read from the acted element's `name` (e.g. the autocomplete
+   suggestion "New York"), not the keystrokes ("new yo") or the dynamic selector
+   (`#c307`).
+2. **Collapse self-corrections.** Typed "San Francisco", cleared it, typed "Los
+   Angeles" → one intent: `origin = Los Angeles`. The intermediate states are noise.
+3. **Drop abandoned actions.** Applied a "window seat" filter then removed it →
+   net zero, omit it entirely. Same for opened-then-closed menus, mis-clicks.
+4. **Parameterize.** The values the user supplied (cities, dates, search terms)
+   become inputs with the recorded value as the example. Structural choices
+   (which button submits) stay fixed.
+5. **Attach a check per step.** The committed value *is* the assertion ("the field
+   reads New York"); for steps with no readable value, point at the step screenshot.
+
+## Output
+
+Write `skills/<task-name>/`:
+- `SKILL.md` — intent-level, parameterized, per-step verification. Each step names
+  the **recorded target** (accessible name/role, plus selector if useful) as a
+  *hint*, and explicitly grants the agent agency to use whatever live element
+  achieves the intent — never bind it to a dynamic id.
+- `screenshots/NN-<label>.png` — the committed-state shot for each intent step,
+  curated from the recording and referenced per step. The visual oracle.
+- `recording.json` — the raw mechanics, carried as a last-resort fallback.
+
+## Teacher prompt (starting point)
+
+> You are distilling a recorded browser session into a reusable task skill. You
+> have `recording.json` (semantic click/type stream with element names), the
+> per-step screenshots, and a queryable `browser-trace` (network/console/DOM).
+> Figure out what the human was *trying to accomplish* — not the literal keystrokes.
+> Collapse corrections, drop abandoned/undone actions, and identify which values
+> were user inputs (parameterize them). Emit a parameterized SKILL.md whose steps
+> are intents with a verification check each. When a step is ambiguous, look at its
+> screenshot and query the trace before deciding. Prefer the fewest steps that
+> reliably reproduce the goal.
diff --git a/skills/browser-record/scripts/inject.js b/skills/browser-record/scripts/inject.js
@@ -0,0 +1,112 @@
+// Injected into every page/frame of the recording session (via addInitScript).
+// Captures human interactions as SEMANTIC steps (not raw x/y) and buffers them in
+// window.__rr_events (mirrored to localStorage so they survive same-origin
+// navigations). The Node side polls + drains this buffer via page.evaluate.
+// (We avoid page.exposeBinding because it does not wire up over Browserbase CDP.)
+(() => { // installs the recorder; all helpers below are private to this closure
+  if (window.__rr_installed) return; // already installed in this window — do nothing
+  window.__rr_installed = true;
+  const KEY = '__rr_buf'; // localStorage key that mirrors the event buffer
+
+  // Reuse an in-memory buffer if one exists; otherwise restore the localStorage mirror (empty array if missing or unparsable).
+  window.__rr_events = window.__rr_events || (() => {
+    try { return JSON.parse(localStorage.getItem(KEY) || '[]'); } catch { return []; }
+  })();
+  // send(ev): append one event to the buffer, then re-mirror the whole buffer to localStorage.
+  const send = (ev) => {
+    window.__rr_events.push(ev);
+    try { localStorage.setItem(KEY, JSON.stringify(window.__rr_events)); } catch (_) {} // best-effort: storage errors are ignored
+  };
+  const now = () => Date.now(); // event timestamp, epoch milliseconds
+  const esc = (s) => { try { return CSS.escape(s); } catch { return s; } }; // falls back to the raw string if CSS.escape throws
+  // cssPath(el): CSS selector built walking up from el — at most 6 segments, stopping at the first element that has an id.
+  function cssPath(el) {
+    if (!(el instanceof Element)) return ''; // non-Element input yields an empty selector
+    const parts = [];
+    while (el && el.nodeType === 1 && parts.length < 6) {
+      if (el.id) { parts.unshift('#' + esc(el.id)); break; }
+      let nth = 1, sib = el; // nth = 1-based position among preceding siblings with the same nodeName
+      while ((sib = sib.previousElementSibling)) if (sib.nodeName === el.nodeName) nth++;
+      parts.unshift(el.nodeName.toLowerCase() + ':nth-of-type(' + nth + ')');
+      el = el.parentElement;
+    }
+    return parts.join(' > ');
+  }
+  // xPath(el): id lookup when el itself has an id (NOTE(review): the id is interpolated unescaped), else an absolute positional path.
+  function xPath(el) {
+    if (el.id) return '//*[@id="' + el.id + '"]';
+    const parts = [];
+    while (el && el.nodeType === 1) {
+      let i = 1, sib = el; // i = 1-based position among preceding siblings with the same nodeName
+      while ((sib = sib.previousElementSibling)) if (sib.nodeName === el.nodeName) i++;
+      parts.unshift(el.nodeName.toLowerCase() + '[' + i + ']');
+      el = el.parentElement;
+    }
+    return '/' + parts.join('/');
+  }
+  // accName(el): attribute-only label — first truthy of aria-label, placeholder, name, title; trimmed, '' if none.
+  function accName(el) {
+    const g = (a) => (el.getAttribute && el.getAttribute(a)) || ''; // attribute value, or '' when absent/unsupported
+    return (g('aria-label') || g('placeholder') || g('name') || g('title') || '').trim();
+  }
+
+  // The INTENT signal: the human-meaningful name of what was acted on. Computed for
+  // any element (no tag allow-list), so an autocomplete suggestion ("New York") is
+  // still named when its only selector is a dynamic id. Priority: aria-label >
+  // aria-labelledby text > placeholder > title > alt > INPUT value > element text.
+  function nameOf(el) {
+    const g = (a) => (el.getAttribute && el.getAttribute(a)) || ''; // attribute value, or '' when absent/unsupported
+    let lbl = ''; // joined innerText of the elements referenced by aria-labelledby
+    const lb = g('aria-labelledby'); // whitespace-separated element ids
+    if (lb) lbl = lb.split(/\s+/).map((id) => (document.getElementById(id) || {}).innerText || '').join(' ').trim();
+    const text = (el.innerText || el.textContent || '').replace(/\s+/g, ' ').trim();
+    const cand = g('aria-label') || lbl || g('placeholder') || g('title') || g('alt')
+      || (el.tagName === 'INPUT' ? el.value : '') || text; // NOTE(review): an unlabelled INPUT is named by its raw value — confirm this is acceptable for sensitive fields
+    return (cand || '').slice(0, 120); // capped at 120 characters
+  }
+  function roleOf(el) {
+    return ((el.getAttribute && el.getAttribute('role')) || el.tagName || '').toLowerCase(); // explicit role attribute, else the tag name; lowercased
+  }
+
+  // Chrome DevTools Recorder format: selectors is an array of single-selector groups, tried in priority order during
+  // replay — here: #id, aria/<label, max 80 chars>, text/<text, max 60 chars, some tags only>, CSS path, xpath/. This list IS the healing.
+  function selectorsFor(el) {
+    const out = [];
+    if (el.id) out.push('#' + esc(el.id));
+    const an = accName(el);
+    if (an) out.push('aria/' + an.slice(0, 80));
+    const txt = (el.innerText || el.textContent || '').trim();
+    if (txt && txt.length <= 60 && ['BUTTON', 'A', 'SUMMARY', 'LABEL', 'SPAN'].includes(el.tagName)) {
+      out.push('text/' + txt);
+    }
+    out.push(cssPath(el));
+    out.push('xpath/' + xPath(el));
+    return out.filter(Boolean).map((s) => [s]); // drop empty selectors, wrap each one in its own group
+  }
+  // Click: registered with capture = true; records name, role, selectors, page URL and timestamp of the clicked element.
+  document.addEventListener('click', (e) => {
+    const el = e.target;
+    if (!el || el.nodeType !== 1) return; // ignore targets that are not element nodes
+    send({ type: 'click', name: nameOf(el), role: roleOf(el), selectors: selectorsFor(el), url: location.href, ts: now() });
+  }, true);
+
+  // 'change' fires on commit/blur -> captures the final field value, not keystrokes.
+  document.addEventListener('change', (e) => {
+    const el = e.target;
+    if (!el || el.nodeType !== 1) return; // ignore targets that are not element nodes
+    const value = ('value' in el) ? el.value : ''; // NOTE(review): recorded verbatim with no input-type check — confirm sensitive values are redacted downstream
+    send({ type: 'change', name: nameOf(el), role: roleOf(el), selectors: selectorsFor(el), value, url: location.href, ts: now() });
+  }, true);
+  // Keydown: only Enter, Tab and Escape are recorded; every other key is ignored.
+  document.addEventListener('keydown', (e) => {
+    if (['Enter', 'Tab', 'Escape'].includes(e.key)) {
+      send({ type: 'keyDown', key: e.key, url: location.href, ts: now() });
+    }
+  }, true);
+  // Scroll: debounced — one event with the window scroll offsets, 400 ms after the last scroll event.
+  let st; // id of the pending debounce timer
+  window.addEventListener('scroll', () => {
+    clearTimeout(st); // restart the debounce window on every scroll event
+    st = setTimeout(() => send({ type: 'scroll', x: window.scrollX, y: window.scrollY, url: location.href, ts: now() }), 400);
+  }, true);
+})();