From a768cb29b57ee4ed93004bd0b040cf91b1fa1cce Mon Sep 17 00:00:00 2001 From: ziruihao Date: Fri, 5 Jun 2026 14:31:01 -0700 Subject: [PATCH 1/2] refactor(codegen): delete codegen.mjs; outer agent owns script generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #125 introduced scripts/codegen.mjs as a one-shot completion-API pipeline that templates a framework prompt, calls the LLM, writes the emitted message text to disk as the script, then verifies and rewrites on failure. The sub-process boundary turned out to be the wrong contract: • Script content rides the model's natural-language output channel, so it competes with the model's conversational instincts. The LLM keeps prepending self-narration ("The error is clear:", "Here is the corrected script:") on the rewrite path, breaking tsx parse — see /tmp/skill/etsy.com/search-products/autobrowse/codegen-cache/ 6c78b599d4d5a9d4.txt from the 2026-06-04 preview run. • Multi-framework runs into a shared --out dir collide on package.json + node_modules (PR #125 fixed this with deep-merge + pkg-hash stamp; the bug only existed because of the sub-process split). • Runner timeouts and the parent verify timeout had to be hand-aligned so the parent doesn't SIGTERM a healthy child mid-install. • Trace/strategy/script artifacts get reasoned about in two places (codegen.mjs writes scripts, the outer agent's bash uploads them). All of those classes of bug disappear when the outer agent owns codegen. It already has the context, the tools (Read/Write/Bash), and the judgment loop. The Write tool's structured `content` argument means script bytes never ride the natural-language channel — no preamble bug. A single agent process means no cross-process timeout coordination, no deps merging across sub-process invocations, and no separate place to reason about "this stagehand failed, drop it before upload". 
Changes: - Delete scripts/codegen.mjs (515 lines) - Delete codegen/runners/ (tsx-runner.mjs, playwright.mjs, stagehand.mjs) - Delete codegen/scaffolds/ (inlined into the new reference docs) - Move + reframe codegen/prompts/{playwright,stagehand}.md to references/codegen/{playwright,stagehand}.md. The technical content (CDP attach pattern, Stagehand v3 constructor shape, locator priorities, snap convention, JSON stdout contract) is preserved; what changed is framing — these are now reference docs an outer agent reads on demand, not completion-API system prompts. - Update SKILL.md's "Generate a runnable script" section to describe the agent-driven loop (Read trace/refs → Write script → Bash verify → iterate or delete on persistent failure). Net diff: -626 lines. The companion change in browse.sh's §4b system prompt — replacing the `node codegen.mjs --frameworks ...` invocation with the inlined Read/Write/Bash loop — lives in a separate PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- skills/autobrowse/SKILL.md | 78 ++- .../autobrowse/codegen/prompts/playwright.md | 55 -- .../autobrowse/codegen/prompts/stagehand.md | 80 --- .../codegen/runners/lib/tsx-runner.mjs | 136 ----- .../autobrowse/codegen/runners/playwright.mjs | 29 - .../autobrowse/codegen/runners/stagehand.mjs | 24 - .../codegen/scaffolds/playwright/package.json | 15 - .../scaffolds/playwright/tsconfig.json | 13 - .../codegen/scaffolds/stagehand/package.json | 15 - .../codegen/scaffolds/stagehand/tsconfig.json | 13 - .../references/codegen/playwright.md | 128 +++++ .../references/codegen/stagehand.md | 145 +++++ skills/autobrowse/scripts/codegen.mjs | 515 ------------------ 13 files changed, 310 insertions(+), 936 deletions(-) delete mode 100644 skills/autobrowse/codegen/prompts/playwright.md delete mode 100644 skills/autobrowse/codegen/prompts/stagehand.md delete mode 100644 skills/autobrowse/codegen/runners/lib/tsx-runner.mjs delete mode 100755 skills/autobrowse/codegen/runners/playwright.mjs delete 
mode 100755 skills/autobrowse/codegen/runners/stagehand.mjs delete mode 100644 skills/autobrowse/codegen/scaffolds/playwright/package.json delete mode 100644 skills/autobrowse/codegen/scaffolds/playwright/tsconfig.json delete mode 100644 skills/autobrowse/codegen/scaffolds/stagehand/package.json delete mode 100644 skills/autobrowse/codegen/scaffolds/stagehand/tsconfig.json create mode 100644 skills/autobrowse/references/codegen/playwright.md create mode 100644 skills/autobrowse/references/codegen/stagehand.md delete mode 100755 skills/autobrowse/scripts/codegen.mjs diff --git a/skills/autobrowse/SKILL.md b/skills/autobrowse/SKILL.md index ba7ca24..0a7b51b 100644 --- a/skills/autobrowse/SKILL.md +++ b/skills/autobrowse/SKILL.md @@ -225,47 +225,43 @@ Read the new summary. Did it pass? Make clear progress? ### Generate a runnable script (optional) -Once the task has converged, you can produce a deterministic, runnable script -in one or more frameworks via `scripts/codegen.mjs`. This is one shot of an -LLM call per framework, cached by content hash, with optional verify-against- -fresh-session and rewrite-on-failure. - -```bash -node ${CLAUDE_SKILL_DIR}/scripts/codegen.mjs \ - --task \ - --workspace ./autobrowse \ - --frameworks playwright,stagehand \ - --verify -``` - -Each framework gets its own subdirectory under `tasks///` -with the emitted script and a self-contained scaffold (`package.json`, -`tsconfig.json`). The directory is runnable standalone with -`cd tasks//playwright && npm install && npx tsx .ts` — the only -runtime requirement is `BROWSERBASE_API_KEY` (plus `ANTHROPIC_API_KEY` for -the Stagehand target). - -Builtin frameworks: `playwright`, `stagehand`. Add a custom framework with -`--prompt-template --frameworks custom` (and provide your own runner -or pass `--no-verify`). 
- -Common flags: - -| Flag | Purpose | -|---|---| -| `--frameworks a,b,...` | Comma-separated; default `playwright` | -| `--verify` / `--no-verify` | Run the produced script against a fresh BB session; default `--verify` | -| `--max-retries N` | Rewrite-on-verify-failure cap; default 2 | -| `--cache-only` | Error if cache miss (CI-friendly) | -| `--force` | Bust the cache | -| `--dry-run` | Estimate prompt size + cost; don't call the LLM | -| `--run ` | Force a specific `run-NNN` (default: latest passing) | - -Output is one JSON line per framework on stdout. Non-zero exit if any -selected framework's final state is `passed: false`. - -See `references/playwright-cdp-bridge.md` for the canonical -`connectOverCDP` patterns the emitted scripts follow. +Once the task has converged, you can produce a runnable script in one or +more frameworks (Playwright, Stagehand) directly using your own `Write` and +`Bash` tools — autobrowse no longer ships a separate `codegen.mjs` +sub-process. The framework-specific specs live as reference docs you read +on demand: + +- `references/codegen/playwright.md` — script shape, scaffold, verify + contract, locator priorities, HTTP-only variant +- `references/codegen/stagehand.md` — Stagehand v3 constructor, `act` / + `extract` patterns, when NOT to ship Stagehand +- `references/playwright-cdp-bridge.md` — canonical `connectOverCDP` + create-session / release dance + +The loop is: + +1. `Read` the converged trace at + `./autobrowse/traces/<task>/latest/{trace.json,unified-events.jsonl}`, + the task's `strategy.md`, and the framework reference doc. +2. `Write` `<script>.ts` into the output directory (e.g. + `tasks/<task>/<framework>/<script>.ts` or a flattened upload root). +3. `Write` the scaffold's `package.json` + `tsconfig.json` per the + reference. When multiple frameworks share an output directory, merge + the `dependencies` across frameworks into a single `package.json`. +4. 
`Bash` `PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 npm install --silent --no-audit --no-fund` + then `npx tsx <script>.ts` against a fresh Browserbase session. +5. Parse the trailing `{"success":boolean,...}` JSON line from stdout. If + it failed, read the stderr tail and iterate — up to ~3 attempts is + reasonable. If still failing, delete the broken script so it isn't + uploaded (the upload glob ships whatever's on disk). + +The agent does this directly because it already has the context, the +tools, and the judgment for "this stderr means …, try X". A sub-process +LLM call (the old `codegen.mjs`) couldn't see why a script was failing +beyond the stderr tail, and tended to bleed natural-language preamble +into the `.ts` file via the completion API's message channel — both +problems disappear when the outer agent writes the file through the +`Write` tool's structured argument. ### After all iterations — publish if ready diff --git a/skills/autobrowse/codegen/prompts/playwright.md b/skills/autobrowse/codegen/prompts/playwright.md deleted file mode 100644 index 51c7de7..0000000 --- a/skills/autobrowse/codegen/prompts/playwright.md +++ /dev/null @@ -1,55 +0,0 @@ -# Playwright codegen — system prompt - -You are converting a converged autobrowse trace into a runnable Playwright -script. Your output is the **complete contents of a `.ts` file**, nothing -else: no preamble, no closing remarks, no markdown fences. - -## Constraints - -- **Self-contained.** The script must run with only `BROWSERBASE_API_KEY` in - the environment. No reliance on autobrowse state, no reading from - workspace files. -- **CDP attach, never `chromium.launch()`.** Follow the - `Playwright ↔ Browserbase bridge` reference verbatim for the - create-session / connectOverCDP / release dance. -- **No `browser.close()`.** Release the session via - `browse cloud sessions update --status REQUEST_RELEASE` in `finally`. 
-- **Final stdout line is JSON.** `{"success":true,"data":...}` on success - or `{"success":false,"error":"..."}` on failure. The runner parses this - line — don't emit any other JSON-looking lines after it. -- **Snap on errors.** Wrap `main()` in `try { … } catch (err) { await snap(page, '99-error'); throw err; }`. Honor `process.env.SCREENSHOT_DIR` for snap output. -- **Locator preferences in order:** `data-testid` attribute → role + name → - id → text → xpath. Prefer Playwright's auto-waiting (`locator.click()`, - `locator.fill()`) over explicit waits when possible. -- **Use the descriptor data when available.** Each `descriptors.ndjson` entry - describes the actual DOM target the agent interacted with — pick locators - from those `attributes` / `role` / `accessibleName` fields rather than - inventing them. -- **Use the trace's network signals.** Where the unified events show a slow - XHR after an action, insert `page.waitForResponse(...)` rather than - arbitrary sleeps. - -## Output schema - -The script must define a Zod schema that mirrors the `# Output` section of -the task.md provided in context, and validate the extracted data through -that schema before printing the final `success: true` line. - -## Imports / runtime - -```typescript -import { chromium, type Browser, type Page } from "playwright"; -import { execFileSync } from "node:child_process"; -import { join } from "node:path"; -import { z } from "zod"; -import "dotenv/config"; -``` - -`playwright` and `zod` are already in the scaffolded `package.json`. Do not -add other dependencies. - -## What to emit - -Output the complete `.ts` file content. Start with imports, end with a call -to `main()`. Nothing before the first import, nothing after the last -closing brace. No markdown fences. 
diff --git a/skills/autobrowse/codegen/prompts/stagehand.md b/skills/autobrowse/codegen/prompts/stagehand.md deleted file mode 100644 index 085f7d3..0000000 --- a/skills/autobrowse/codegen/prompts/stagehand.md +++ /dev/null @@ -1,80 +0,0 @@ -# Stagehand codegen — system prompt - -You are converting a converged autobrowse trace into a runnable Stagehand -script. Your output is the **complete contents of a `.ts` file**, nothing -else: no preamble, no closing remarks, no markdown fences. - -This targets **Stagehand v3** (`@browserbasehq/stagehand` 3.x). The v3 API -differs from older examples — follow the patterns below exactly. - -## Constraints - -- **Self-contained.** The script must run with `BROWSERBASE_API_KEY` and - `ANTHROPIC_API_KEY` in the environment. -- **Stagehand owns its own Browserbase session.** Construct it with - `env: "BROWSERBASE"` and let it create the session — do NOT pre-create a - session via the `browse` CLI and do NOT pass `browserbaseSessionID`. The - constructor shape is: - ```typescript - const stagehand = new Stagehand({ - env: "BROWSERBASE", - apiKey: process.env.BROWSERBASE_API_KEY, // ← BROWSERBASE key (NOT the Anthropic key); project inferred from it - model: { // ← LLM config lives here, not at top level - modelName: "anthropic/claude-sonnet-4-6", // ← provider-prefixed; do not invent model names - apiKey: process.env.ANTHROPIC_API_KEY, - }, - }); - await stagehand.init(); - ``` - The top-level `apiKey` is the **Browserbase** API key (the project is - inferred from it — no `projectId` needed). There is no `browserbaseAPIKey` - field and no top-level `modelName` — using the Anthropic key as `apiKey` - makes session lookup fail with a 404. -- **Get the page from the context, not `stagehand.page`.** - ```typescript - const page = stagehand.context.pages()[0] ?? 
(await stagehand.context.newPage()); - await page.goto(url, { waitUntil: "domcontentloaded" }); - ``` - `page` supports `goto`, `waitForTimeout`, `waitForSelector`, `screenshot`. -- **`act` and `extract` are methods on the `stagehand` instance, not the page.** - - Actions: `await stagehand.act("click the Continue button")` - - Data: `await stagehand.extract("", zodSchema)` — pass the Zod - schema as the second argument; it returns the parsed object. - Prefer natural-language intent strings — the whole point of Stagehand is the - LLM picks the locator at runtime. -- **One natural-language action per `act` call.** Don't compound - ("click X and fill Y"); chain individual `act` calls so each is retryable. -- **Schema-backed extract.** Define Zod schemas mirroring the `# Output` - section of task.md and validate before emitting the final `success: true` - line. -- **Use the descriptors as natural-language hints.** Where a descriptor shows - `accessibleName: "Continue"`, the corresponding `act` should say - `"click the Continue button"`. Specific locators aren't required. -- **Snap on errors.** Wrap the body in - `try { … } catch (err) { await snap(page, '99-error'); … }`, honoring - `process.env.SCREENSHOT_DIR`. `snap` should be a no-op when the dir is unset. -- **Final stdout line is JSON.** `{"success":true,"data":...}` on success, - `{"success":false,"error":"..."}` on failure. The runner parses this — emit - no other JSON-looking lines after it. -- **Tear down with `await stagehand.close()` in `finally`.** Since Stagehand - created and owns the session, `close()` is the correct teardown — do NOT use - `browse cloud sessions update … REQUEST_RELEASE` (that's only for the - CDP-attach pattern where you created the session yourself). 
- -## Imports / runtime - -```typescript -import { Stagehand } from "@browserbasehq/stagehand"; -import { join } from "node:path"; -import { z } from "zod"; -import "dotenv/config"; -``` - -`@browserbasehq/stagehand` and `zod` are already in the scaffolded -`package.json`. Do not add other dependencies. - -## What to emit - -Output the complete `.ts` file content. Start with imports, end with a call -to `main()`. Nothing before the first import, nothing after the last -closing brace. No markdown fences. diff --git a/skills/autobrowse/codegen/runners/lib/tsx-runner.mjs b/skills/autobrowse/codegen/runners/lib/tsx-runner.mjs deleted file mode 100644 index c90407c..0000000 --- a/skills/autobrowse/codegen/runners/lib/tsx-runner.mjs +++ /dev/null @@ -1,136 +0,0 @@ -// tsx-runner.mjs — shared logic for codegen target runners that boot a tsx -// script in a scaffolded output dir and parse its trailing JSON line. -// -// Playwright and Stagehand runners (and any future TS target that follows the -// same {"success":boolean,"data":...} contract) call runTsxTarget with their -// per-framework tweaks: a label for stderr prefix, extra env (e.g. -// PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1), and an optional preflight check (e.g. -// "ANTHROPIC_API_KEY required for Stagehand"). - -import * as fs from "node:fs"; -import * as path from "node:path"; -import * as crypto from "node:crypto"; -import { spawnSync } from "node:child_process"; - -export function getArg(name) { - const i = process.argv.indexOf(`--${name}`); - return i !== -1 && process.argv[i + 1] ? process.argv[i + 1] : null; -} - -// Emit a JSON result line on stdout and exit. Centralized so the contract -// (single {passed:bool,...} JSON line, exit 0/2) is consistent across runners. -function emitAndExit(result) { - console.log(JSON.stringify(result)); - process.exit(result.passed ? 0 : 2); -} - -/** - * Run a tsx target script against a fresh BB session. 
- * - * @param {object} opts - * @param {string} opts.label stderr prefix, e.g. "playwright" - * @param {Record} [opts.extraEnv] merged into the run's env - * @param {Record} [opts.installEnv] merged into npm install's env - * @param {() => string|null} [opts.preflight] return error message to fail fast - */ -export function runTsxTarget(opts) { - const { label, extraEnv = {}, installEnv = {}, preflight } = opts; - const outDir = getArg("out-dir"); - const script = getArg("script"); - - if (!outDir || !script) { - emitAndExit({ passed: false, error: "runner missing --out-dir or --script" }); - } - - const scriptPath = path.join(outDir, script); - if (!fs.existsSync(scriptPath)) { - emitAndExit({ passed: false, error: `script not found at ${scriptPath}` }); - } - - if (preflight) { - const err = preflight(); - if (err) emitAndExit({ passed: false, error: err }); - } - - // Install deps when package.json changes. Gating purely on node_modules - // existing is wrong when two frameworks share an --out dir: framework #2's - // dropScaffold merges its deps into the existing package.json, but the - // node_modules from framework #1's install is still missing them. We hash - // package.json and compare against a stamp under node_modules/ to detect - // that and re-install. - const pkgPath = path.join(outDir, "package.json"); - const stampPath = path.join(outDir, "node_modules", ".codegen-pkg-hash"); - const pkgHash = fs.existsSync(pkgPath) - ? crypto.createHash("sha256").update(fs.readFileSync(pkgPath)).digest("hex") - : null; - const stampedHash = fs.existsSync(stampPath) - ? fs.readFileSync(stampPath, "utf-8").trim() - : null; - if (pkgHash && pkgHash !== stampedHash) { - process.stderr.write(`[runner.${label}] installing deps in ${outDir}\n`); - // Always set PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 here, regardless of which - // runner we are. In shared --out mode, framework #2 (e.g. 
stagehand) gets - // playwright merged into its package.json by dropScaffold, so even runners - // that don't list playwright in installEnv would still trigger its - // postinstall and try to fetch hundreds of MB of chromium — exhausting - // the 3min install budget. We never need bundled browsers (always CDP). - const install = spawnSync("npm", ["install", "--silent", "--no-audit", "--no-fund"], { - cwd: outDir, - stdio: ["ignore", "inherit", "inherit"], - env: { ...process.env, PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1", ...installEnv }, - timeout: 3 * 60 * 1000, - }); - if (install.status !== 0) { - emitAndExit({ passed: false, error: `npm install exited ${install.status}` }); - } - try { - fs.mkdirSync(path.dirname(stampPath), { recursive: true }); - fs.writeFileSync(stampPath, pkgHash); - } catch {} - } - - // Per-run screenshot dir, exposed to the script via SCREENSHOT_DIR so its - // snap() helper can write progress / failure shots somewhere we can find. - const screenshotDir = path.join(outDir, "screenshots", `verify-${Date.now()}`); - fs.mkdirSync(screenshotDir, { recursive: true }); - - process.stderr.write(`[runner.${label}] running ${scriptPath}\n`); - const run = spawnSync("npx", ["tsx", script], { - cwd: outDir, - encoding: "utf-8", - stdio: ["ignore", "pipe", "pipe"], - env: { ...process.env, ...extraEnv, SCREENSHOT_DIR: screenshotDir }, - timeout: 5 * 60 * 1000, - }); - - const stdout = run.stdout ?? ""; - const stderr = run.stderr ?? ""; - - // Parse the script's trailing JSON line — walk backward through lines and - // take the last one that parses as JSON with a boolean `success` field. 
- let parsed = null; - const lines = stdout.trim().split("\n").filter(Boolean); - for (let i = lines.length - 1; i >= 0; i--) { - try { - const candidate = JSON.parse(lines[i]); - if (typeof candidate?.success === "boolean") { - parsed = candidate; - break; - } - } catch {} - } - - const passed = run.status === 0 && parsed?.success === true; - const result = { - passed, - exit_code: run.status, - script_output: parsed, - screenshot_dir: screenshotDir, - stderr_tail: stderr.slice(-2000), - }; - if (!passed) { - result.error = parsed?.error - || (run.status !== 0 ? `script exited ${run.status}` : "script did not emit success:true"); - } - emitAndExit(result); -} diff --git a/skills/autobrowse/codegen/runners/playwright.mjs b/skills/autobrowse/codegen/runners/playwright.mjs deleted file mode 100755 index dbc2f4e..0000000 --- a/skills/autobrowse/codegen/runners/playwright.mjs +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env node - -/** - * playwright.mjs — Runner for the Playwright codegen target. - * - * Invoked by codegen.mjs's verify step. Installs the scaffolded deps if - * needed, spawns `npx tsx