diff --git a/README.md b/README.md index e96769e..71488ae 100644 --- a/README.md +++ b/README.md @@ -11,11 +11,11 @@ integration PRs, follows `main` through production, and pauses after failures. ## Install -Install the reviewed `v0.2.12` source commit directly from GitHub: +Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@01c8c6e48c3a92155803cd4232b56b0c1d3363c2' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a90e1db410b9197fe056c591d91c1b2019' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@01c8c6e48c3a92155803cd4232b56b0c1d3363c2 +- uses: Forward-Future/DeployBot@992048a90e1db410b9197fe056c591d91c1b2019 ``` The Action uses GitHub's built-in workflow token. GitHub intentionally does not @@ -190,6 +190,14 @@ deploybot resume and emits a new wake-up event. `follow` tracks newer cumulative `main` revisions until exact CI, deployment, and optional HTTP checks pass. A CI or deploy failure can pause further merges until `deploybot unpause`. +Before presenting an unpause request, adapters must refresh `deploybot status +--json` and suppress stale prompts when the durable controller is already +running or the release advanced. The original deploy instruction authorizes the +coordinator to unpause the matching failed release after its elected repair +head passes fresh checks and review. Pass that status result's failed main SHA +and unique `control_id` to `deploybot unpause --sha SHA --control-id ID` so a +concurrent newer pause remains authoritative. Rollback, +bypass, and mismatched recovery still require explicit user direction. 
Before starting an exact-main recovery, an agent runs `deploybot claim-release-repair --provider CLIENT --thread-id ID`. A @@ -323,7 +331,7 @@ deploybot integrate [--all] deploybot follow [--timeout SECONDS] [--poll SECONDS] [--json] deploybot metrics --json deploybot pause --reason "main CI failed" -deploybot unpause +deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID deploybot block [PR] --reason "..." deploybot unblock [PR] deploybot resume [PR] diff --git a/adapters/claude-code/.claude-plugin/plugin.json b/adapters/claude-code/.claude-plugin/plugin.json index 810ed19..61faa61 100644 --- a/adapters/claude-code/.claude-plugin/plugin.json +++ b/adapters/claude-code/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "deploybot", - "version": "0.2.12", + "version": "0.2.13", "description": "DeployBot: a provider-neutral GitHub merge queue for coding agents", "author": { "name": "DeployBot contributors" diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index 49ebaf9..7c4e77e 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@01c8c6e48c3a92155803cd4232b56b0c1d3363c2", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a90e1db410b9197fe056c591d91c1b2019", "deploybot-mcp" ] } diff --git a/adapters/claude-code/skills/deploybot/SKILL.md b/adapters/claude-code/skills/deploybot/SKILL.md index cf88e38..f79f2f6 100644 --- a/adapters/claude-code/skills/deploybot/SKILL.md +++ b/adapters/claude-code/skills/deploybot/SKILL.md @@ -79,6 +79,21 @@ Use `diagnose`/`deploybot doctor` for setup drift and `delivery_metrics` for p50 p95, and slow-stage evidence. A failed cumulative CI or deployment pauses the controller; only a designated coordinator may unpause after recovery. 
+Immediately before telling the user that the pipeline is paused or asking them +to `unpause`, re-read `pipeline_status` or run `deploybot status --json`. Treat +that fresh durable state as authoritative. If the controller is already +running or the release has advanced, do not repeat a stale action request; +continue coordinating or report the current gate. + +The original `deploy` instruction already authorizes a designated coordinator +to run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` +for the matching failed release when the elected repair head has fresh +required checks and review, the pause reason still names that release, and no +rollback or gate waiver is involved. Revalidate status, unpause, then continue +the merge and release without asking for another user message. Ask the user +only when recovery is unresolved, ownership or SHA does not match, or the next +step requires a rollback, bypass, or expanded authority. + Before opening or editing an exact-main recovery PR, call `claim_release_repair` with the native provider and thread ID. Work only when it returns `owned`, using its deterministic branch. If it returns `claimed`, the diff --git a/adapters/claude-code/skills/manage-merge-queue/SKILL.md b/adapters/claude-code/skills/manage-merge-queue/SKILL.md index 8d92cb7..e1e2c4f 100644 --- a/adapters/claude-code/skills/manage-merge-queue/SKILL.md +++ b/adapters/claude-code/skills/manage-merge-queue/SKILL.md @@ -29,6 +29,15 @@ returned `owned` thread may use the deterministic repair branch. Respect the maximum batch size and keep new merges closed while an earlier release is unfinished. +Immediately before asking the user to `unpause` or take another repair action, +call `pipeline_status` again. Never show a stale pause prompt when durable state +is already `running` or the release has advanced. 
The original `deploy` +instruction authorizes the coordinator to unpause the matching failed release +after the elected repair head passes fresh checks and review, provided the pause +reason still matches and no rollback or gate waiver is needed. In that case, +run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` and +continue without asking the user to repeat authorization. + When `follow_release` returns `thread_notifications`, send each supplied message to its native source thread. The source thread calls `acknowledge_thread_deployment` with the matching `notification_id`. Present the diff --git a/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json b/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json index 1dcbe9f..9bc4a48 100644 --- a/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json +++ b/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "deploybot", - "version": "0.2.12", + "version": "0.2.13", "description": "Coordinate exact-head pull requests through verified deployment and thread notification", "author": { "name": "DeployBot contributors" diff --git a/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md b/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md index 3d9f285..0d81d66 100644 --- a/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md +++ b/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md @@ -76,6 +76,21 @@ Use `deploybot doctor --json` for setup drift and `deploybot metrics --json` for p50, p95, and slow-stage evidence. A failed cumulative CI or deployment pauses the controller; only a designated coordinator may unpause after recovery. +Immediately before telling the user that the pipeline is paused or asking them +to `unpause`, run `deploybot status --json` again. Treat that fresh durable +state as authoritative. If the controller is already running or the release +has advanced, do not repeat a stale action request; continue coordinating or +report the current gate. 
+ +The original `deploy` instruction already authorizes a designated coordinator +to run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` +for the matching failed release when the elected +repair head has fresh required checks and review, the pause reason still names +that release, and no rollback or gate waiver is involved. Revalidate status, +unpause, then continue the merge and release without asking for another user +message. Ask the user only when recovery is unresolved, ownership or SHA does +not match, or the next step requires a rollback, bypass, or expanded authority. + Before opening or editing an exact-main recovery PR, run `deploybot claim-release-repair` with the native provider and thread ID. Work only when it returns `owned`, using its deterministic branch. If it returns `claimed`, the diff --git a/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md b/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md index a8d0937..aba3a34 100644 --- a/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md +++ b/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md @@ -30,6 +30,16 @@ only the returned `owned` thread may use the deterministic repair branch. Respec maximum batch size and keep new merges closed while an earlier release is unfinished. +Immediately before asking the user to `unpause` or take another repair action, +run `deploybot status --json` again. Never show a stale pause prompt when +durable state is already `running` or the release has advanced. The original +`deploy` instruction authorizes the coordinator to run `deploybot unpause +--sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` for +the matching failed release after the elected repair head passes fresh checks +and review, provided the pause reason still matches and no rollback or gate +waiver is needed. In that case, unpause and continue without asking the user to +repeat authorization. 
+ When `deploybot follow --json` returns `thread_notifications`, send each supplied message to its native source thread. In Codex use `send_message_to_thread`; the source thread runs `deploybot thread acknowledge` with the matching diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index 49ebaf9..7c4e77e 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@01c8c6e48c3a92155803cd4232b56b0c1d3363c2", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a90e1db410b9197fe056c591d91c1b2019", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/rules/deploybot.mdc b/adapters/cursor/.cursor/rules/deploybot.mdc index fdeec95..4039a71 100644 --- a/adapters/cursor/.cursor/rules/deploybot.mdc +++ b/adapters/cursor/.cursor/rules/deploybot.mdc @@ -15,6 +15,14 @@ the stable Cursor thread ID, never prompts or transcripts. Refresh intent only after replacement-head review. Only the coordinator may react, integrate, drain, follow, pause, or resume repaired work. +Immediately before asking the user to unpause or take repair action, call +`pipeline_status` again and suppress the request if durable state is already +running or the release advanced. The original deploy instruction authorizes the +coordinator to unpause the matching failed release after its elected repair head +passes fresh checks and review, unless recovery requires a rollback, gate +waiver, or different authority. Use the exact failed main SHA and the refreshed +unique `control_id` so a newer pause remains authoritative. + After exact-main verification, deliver each returned `thread_notifications` message into its native source thread, then call `acknowledge_thread_deployment` with the matching `notification_id`. 
Leave a diff --git a/docs/reference.md b/docs/reference.md index a1f3347..5a17b07 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,7 +1,7 @@ # DeployBot reference This reference describes the CLI, MCP server, policy file, and GitHub Action in -DeployBot v0.2.12. GitHub labels and authenticated comments are the durable state; +DeployBot v0.2.13. GitHub labels and authenticated comments are the durable state; the CLI and MCP tools are two interfaces to the same operations. ## CLI @@ -58,7 +58,7 @@ has fresh evidence; the user does not need to repeat the instruction. | `deploybot integrate [--all]` | Scaffold a cumulative integration PR for overlap groups, or the whole frozen batch with `--all`. | | `deploybot follow [--timeout SECONDS] [--poll SECONDS] [--json]` | Follow the newest exact base-branch head through CI, deployment, and HTTP verification. Defaults: 1800-second timeout and 10-second poll. | | `deploybot pause --reason TEXT` | Pause merging after a delivery failure. | -| `deploybot unpause` | Resume a pipeline after verified recovery. | +| `deploybot unpause --sha SHA --control-id ID` | Conditionally resume the matching failed release after fresh status revalidation and verified repair; a running record can clear only that unique pause, so changed control or advanced main fails closed. The original deploy instruction remains sufficient unless rollback, bypass, or mismatched recovery expands authority. | | `deploybot claim-release-repair --provider CLIENT --thread-id ID [--thread-url URL] [--sha SHA]` | Atomically claim the owner-encoded deterministic repair branch for the current failed exact-main release. Other threads recover the same owner from the ref instead of creating duplicate repair PRs. | Only a configured coordinator should run these operations. 
`react diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index 7d5f2d8..af2d2a6 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -59,5 +59,5 @@ jobs: with: ref: ${{ github.event.repository.default_branch }} persist-credentials: false - # v0.2.12 implementation; keep the full commit for privileged workflows. - - uses: Forward-Future/DeployBot@01c8c6e48c3a92155803cd4232b56b0c1d3363c2 + # v0.2.13 implementation; keep the full commit for privileged workflows. + - uses: Forward-Future/DeployBot@992048a90e1db410b9197fe056c591d91c1b2019 diff --git a/pyproject.toml b/pyproject.toml index ce2b12d..d0b5838 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deploybot-merge-queue" -version = "0.2.12" +version = "0.2.13" description = "DeployBot: a provider-neutral GitHub merge queue for coding agents" readme = "README.md" license = "MIT" diff --git a/skills/deploybot/SKILL.md b/skills/deploybot/SKILL.md index cf88e38..f79f2f6 100644 --- a/skills/deploybot/SKILL.md +++ b/skills/deploybot/SKILL.md @@ -79,6 +79,21 @@ Use `diagnose`/`deploybot doctor` for setup drift and `delivery_metrics` for p50 p95, and slow-stage evidence. A failed cumulative CI or deployment pauses the controller; only a designated coordinator may unpause after recovery. +Immediately before telling the user that the pipeline is paused or asking them +to `unpause`, re-read `pipeline_status` or run `deploybot status --json`. Treat +that fresh durable state as authoritative. If the controller is already +running or the release has advanced, do not repeat a stale action request; +continue coordinating or report the current gate. 
+ +The original `deploy` instruction already authorizes a designated coordinator +to run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` +for the matching failed release when the elected repair head has fresh +required checks and review, the pause reason still names that release, and no +rollback or gate waiver is involved. Revalidate status, unpause, then continue +the merge and release without asking for another user message. Ask the user +only when recovery is unresolved, ownership or SHA does not match, or the next +step requires a rollback, bypass, or expanded authority. + Before opening or editing an exact-main recovery PR, call `claim_release_repair` with the native provider and thread ID. Work only when it returns `owned`, using its deterministic branch. If it returns `claimed`, the diff --git a/skills/manage-merge-queue/SKILL.md b/skills/manage-merge-queue/SKILL.md index c8cdf78..dd3dc91 100644 --- a/skills/manage-merge-queue/SKILL.md +++ b/skills/manage-merge-queue/SKILL.md @@ -56,3 +56,12 @@ new merges closed while an earlier exact-main release is unfinished. Record exact heads, review verdicts, merged commits, waiting items, repair packets, integration groups, and delivery timing. + +Immediately before asking the user to `unpause` or take another repair action, +call `pipeline_status` again. Never show a stale pause prompt when durable state +is already `running` or the release has advanced. The original `deploy` +instruction authorizes the coordinator to unpause the matching failed release +after the elected repair head passes fresh checks and review, provided the pause +reason still matches and no rollback or gate waiver is needed. In that case, +run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` and +continue without asking the user to repeat authorization. 
diff --git a/src/agent_merge_queue/__init__.py b/src/agent_merge_queue/__init__.py index 82cb24c..bf36d52 100644 --- a/src/agent_merge_queue/__init__.py +++ b/src/agent_merge_queue/__init__.py @@ -1,3 +1,3 @@ """DeployBot: a provider-neutral GitHub merge queue for coding agents.""" -__version__ = "0.2.12" +__version__ = "0.2.13" diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index d3a871a..e2180a6 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -8,6 +8,7 @@ import hashlib import json import re +import secrets import shutil import subprocess import sys @@ -31,7 +32,6 @@ summarize_metrics, ) from .records import ( - CONTROL_MARKER, INTEGRATION_MARKER, REPAIR_MARKER, RELEASE_WATERMARK_MARKER, @@ -43,6 +43,7 @@ integration_body, intent_body, latest_intent, + latest_control, latest_release_repair, latest_deployment_notifications, latest_payload, @@ -1393,18 +1394,43 @@ def thread_records(self, *, include_terminal: bool = False) -> list[dict[str, An ] def pipeline_control(self) -> dict[str, Any]: - value = latest_payload( - self.registry_comments(), - CONTROL_MARKER, - self.coordinator_logins, - ) - return value or {"state": "running"} + control = latest_control(self.registry_comments(), self.coordinator_logins) + if ( + control.get("state") == "paused" + and control.get("legacy_control") + and not control.get("main_sha") + ): + # v0.2.12 pause records predate release binding. The immutable + # comment ID still supplies a unique compare-and-set token; bind + # the migration view to the current main and recheck it at write. 
+ return {**control, "main_sha": self.base_sha()} + return control - def set_pipeline_control(self, state: str, reason: str | None = None) -> None: + def set_pipeline_control( + self, + state: str, + reason: str | None = None, + *, + main_sha: str | None = None, + requires_control_id: str | None = None, + resumes_control_id: str | None = None, + ) -> str: number = self.registry_issue_number(create=True) if number is None: # pragma: no cover raise QueueError("could not create DeployBot registry") - self.issue_comment(number, control_body(state=state, reason=reason)) + control_id = secrets.token_hex(16) + self.issue_comment( + number, + control_body( + state=state, + control_id=control_id, + reason=reason, + main_sha=(main_sha or self.base_sha()) if state == "paused" else None, + requires_control_id=requires_control_id, + resumes_control_id=resumes_control_id, + ), + ) + return control_id def verified_main_sha(self) -> str | None: value = latest_payload( @@ -4441,6 +4467,7 @@ def command_follow( client.set_pipeline_control( "paused", f"{result['state']} on {result['main_sha']}", + main_sha=str(result["main_sha"]), ) if result["state"] == "verified": client.record_verified_main(str(result["main_sha"])) @@ -5080,6 +5107,57 @@ def command_control(client: GitHub, *, state: str, reason: str | None) -> None: print(f"DeployBot pipeline is {state}") +def command_unpause( + client: GitHub, + *, + main_sha: str, + control_id: str, +) -> None: + control = client.pipeline_control() + if control.get("state") != "paused": + raise QueueError("DeployBot pipeline is no longer paused; refresh status") + if str(control.get("control_id") or "") != control_id: + raise QueueError("DeployBot pause record changed; refresh status") + if str(control.get("main_sha") or "") != main_sha: + raise QueueError("DeployBot pause belongs to a different main; refresh status") + current_sha = client.base_sha() + if current_sha != main_sha: + raise QueueError( + f"DeployBot main advanced from {main_sha} 
to {current_sha}; refresh status" + ) + resume_control_id = client.set_pipeline_control( + "running", None, resumes_control_id=control_id + ) + refreshed = client.pipeline_control() + if refreshed.get("state") != "running" or ( + refreshed.get("resumes_control_id") != control_id + ) or refreshed.get("control_id") != resume_control_id: + raise QueueError("DeployBot pause changed during unpause; refresh status") + refreshed_main = client.base_sha() + if refreshed_main != main_sha: + latest_control = client.pipeline_control() + if latest_control.get("state") == "running" and ( + latest_control.get("resumes_control_id") == control_id + ): + client.set_pipeline_control( + "paused", + f"main advanced during unpause from {main_sha} to {refreshed_main}", + main_sha=refreshed_main, + requires_control_id=resume_control_id, + ) + raise QueueError( + f"DeployBot main advanced from {main_sha} to {refreshed_main} " + "during unpause; pipeline remains paused" + ) + # This is the compare-and-set boundary. A main advance after this final + # read occurs after the matching pause was successfully resumed and may be + # the expected repair merge. The release-admission fence then blocks every + # later batch until that newer exact main passes CI, deploy, and health + # verification; binding `running` to the failed SHA forever would instead + # re-pause the repair merge and strand takeover workers. 
+ print(f"DeployBot pipeline is running for recovered main {main_sha}") + + def command_claim_release_repair( client: GitHub, *, @@ -5261,7 +5339,11 @@ def build_parser() -> argparse.ArgumentParser: "pause", help="pause merging after a delivery failure" ) pause.add_argument("--reason", required=True) - subparsers.add_parser("unpause", help="resume a paused delivery pipeline") + unpause = subparsers.add_parser( + "unpause", help="resume the exact revalidated failed release" + ) + unpause.add_argument("--sha", required=True, dest="main_sha") + unpause.add_argument("--control-id", required=True) claim_repair = subparsers.add_parser( "claim-release-repair", help="atomically claim ownership of the current failed release", @@ -5387,7 +5469,11 @@ def main(argv: list[str] | None = None) -> int: elif arguments.command == "pause": command_control(client, state="paused", reason=arguments.reason) elif arguments.command == "unpause": - command_control(client, state="running", reason=None) + command_unpause( + client, + main_sha=arguments.main_sha, + control_id=arguments.control_id, + ) elif arguments.command == "claim-release-repair": command_claim_release_repair( client, diff --git a/src/agent_merge_queue/records.py b/src/agent_merge_queue/records.py index 38a0b85..dcaf7e0 100644 --- a/src/agent_merge_queue/records.py +++ b/src/agent_merge_queue/records.py @@ -120,6 +120,63 @@ def latest_payload( return max(found, key=lambda item: item[0])[1] if found else None +def latest_control( + comments: Iterable[dict[str, Any]], trusted_logins: Iterable[str] +) -> dict[str, Any]: + """Resolve pause/resume records without letting a stale resume clear a new pause.""" + trusted = {value.lower() for value in trusted_logins} + found: list[tuple[tuple[str, int, int], dict[str, Any]]] = [] + for index, comment in enumerate(comments): + if comment_login(comment) not in trusted: + continue + value = _payload(str(comment.get("body") or ""), CONTROL_MARKER) + if value is not None: + if 
value.get("state") == "paused" and not value.get("control_id"): + comment_id = comment.get("id") + legacy_id = ( + f"legacy-comment:{comment_id}" + if comment_id is not None + else f"legacy-record:{_comment_key(comment, index)}" + ) + value = { + **value, + "control_id": legacy_id, + "legacy_control": True, + } + found.append((_comment_key(comment, index), value)) + + state: dict[str, Any] = {"state": "running"} + for _, value in sorted(found, key=lambda item: item[0]): + if value.get("state") == "paused": + requires_control_id = str(value.get("requires_control_id") or "") + if requires_control_id and not ( + state.get("state") == "running" + and state.get("control_id") == requires_control_id + ): + continue + state = value + continue + if value.get("state") != "running": + continue + resumed_control_id = str(value.get("resumes_control_id") or "") + if not resumed_control_id: + # Backward-compatible unconditional running records from v0.2.12. + # They may clear only legacy pauses; a rolling-upgrade client must + # never override a modern pause that requires a matching token. 
+ if not ( + state.get("state") == "paused" + and state.get("control_id") + and not state.get("legacy_control") + ): + state = value + elif ( + state.get("state") == "paused" + and state.get("control_id") == resumed_control_id + ): + state = value + return state + + @dataclass(frozen=True) class ThreadRecord: provider: str @@ -474,12 +531,31 @@ def latest_release_repair( return max(found, key=lambda item: item[0])[1] if found else None -def control_body(*, state: str, reason: str | None = None) -> str: +def control_body( + *, + state: str, + control_id: str, + reason: str | None = None, + main_sha: str | None = None, + requires_control_id: str | None = None, + resumes_control_id: str | None = None, +) -> str: if state not in {"running", "paused"}: raise ValueError(f"unsupported pipeline control state: {state}") - payload = {"recorded_at": utc_now(), "schema": 1, "state": state} + payload = { + "control_id": control_id, + "recorded_at": utc_now(), + "schema": 1, + "state": state, + } if reason: payload["reason"] = reason + if main_sha: + payload["main_sha"] = main_sha + if requires_control_id: + payload["requires_control_id"] = requires_control_id + if resumes_control_id: + payload["resumes_control_id"] = resumes_control_id return marker_body(CONTROL_PREFIX, payload, "Recorded DeployBot pipeline control.") diff --git a/tests/test_cli.py b/tests/test_cli.py index 24f9cf0..d1d119c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,6 +32,7 @@ command_resume, command_thread_acknowledge, command_unblock, + command_unpause, completed_batch_ids, delivery_metrics, deployment_repair_required, @@ -65,6 +66,7 @@ ) from agent_merge_queue.config import parse_config from agent_merge_queue.records import ( + control_body, integration_body, intent_body, release_repair_body, @@ -138,6 +140,32 @@ def deployment_notification( class QueueCoreTest(unittest.TestCase): + def test_follow_binds_pause_to_observed_failed_main(self) -> None: + sha = "a" * 40 + client = Mock() + 
client.config = CONFIG + release = { + "state": "ci-failed", + "main_sha": sha, + "latest_ci": {}, + "latest_deploy": None, + "verifications": [], + } + + with patch("agent_merge_queue.cli.follow_release", return_value=release): + result = command_follow( + client, + timeout_seconds=10, + poll_seconds=1, + json_output=True, + emit=False, + ) + + self.assertEqual(result, release) + client.set_pipeline_control.assert_called_once_with( + "paused", f"ci-failed on {sha}", main_sha=sha + ) + def test_pull_release_details_reads_human_facing_metadata(self) -> None: client = object.__new__(GitHub) client.repository = "example/repo" @@ -4206,6 +4234,303 @@ def test_reactor_pauses_when_post_merge_ci_dispatch_fails(self) -> None: "paused", "post-merge CI dispatch failed: CI has no dispatch" ) + def test_unpause_compare_and_sets_matching_failed_release(self) -> None: + sha = "a" * 40 + control_id = "pause-1" + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "reason": f"ci-failed on {sha}", + "control_id": control_id, + "main_sha": sha, + }, + { + "state": "running", + "control_id": "resume-1", + "resumes_control_id": control_id, + }, + ] + client.base_sha.return_value = sha + client.set_pipeline_control.return_value = "resume-1" + + with redirect_stdout(io.StringIO()): + command_unpause( + client, + main_sha=sha, + control_id=control_id, + ) + + client.set_pipeline_control.assert_called_once_with( + "running", None, resumes_control_id=control_id + ) + + def test_unpause_rejects_changed_pause_record(self) -> None: + sha = "a" * 40 + client = Mock() + client.pipeline_control.return_value = { + "state": "paused", + "reason": f"ci-failed on {sha}", + "control_id": "newer", + "main_sha": sha, + } + + with self.assertRaisesRegex(QueueError, "pause record changed"): + command_unpause( + client, + main_sha=sha, + control_id="older", + ) + + client.set_pipeline_control.assert_not_called() + + def test_unpause_rejects_advanced_main(self) -> None: + 
sha = "a" * 40 + client = Mock() + client.pipeline_control.return_value = { + "state": "paused", + "reason": f"ci-failed on {sha}", + "control_id": "same", + "main_sha": sha, + } + client.base_sha.return_value = "b" * 40 + + with self.assertRaisesRegex(QueueError, "main advanced"): + command_unpause( + client, + main_sha=sha, + control_id="same", + ) + + client.set_pipeline_control.assert_not_called() + + def test_unpause_rejects_new_pause_won_during_transition(self) -> None: + sha = "a" * 40 + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "control_id": "pause-1", + "main_sha": sha, + }, + { + "state": "paused", + "control_id": "pause-2", + "main_sha": sha, + }, + ] + client.base_sha.return_value = sha + client.set_pipeline_control.return_value = "resume-1" + + with self.assertRaisesRegex(QueueError, "changed during unpause"): + command_unpause(client, main_sha=sha, control_id="pause-1") + + client.set_pipeline_control.assert_called_once_with( + "running", None, resumes_control_id="pause-1" + ) + + def test_unpause_repauses_when_main_advances_during_transition(self) -> None: + sha = "a" * 40 + newer = "b" * 40 + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "control_id": "pause-1", + "main_sha": sha, + }, + { + "state": "running", + "control_id": "resume-1", + "resumes_control_id": "pause-1", + }, + { + "state": "running", + "control_id": "resume-1", + "resumes_control_id": "pause-1", + }, + ] + client.base_sha.side_effect = [sha, newer] + client.set_pipeline_control.side_effect = ["resume-1", "compensation-1"] + + with self.assertRaisesRegex(QueueError, "pipeline remains paused"): + command_unpause(client, main_sha=sha, control_id="pause-1") + + self.assertEqual( + client.set_pipeline_control.call_args_list, + [ + call("running", None, resumes_control_id="pause-1"), + call( + "paused", + f"main advanced during unpause from {sha} to {newer}", + main_sha=newer, + 
requires_control_id="resume-1", + ), + ], + ) + + def test_unpause_preserves_newer_pause_when_main_advances(self) -> None: + sha = "a" * 40 + newer = "b" * 40 + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "control_id": "pause-1", + "main_sha": sha, + }, + { + "state": "running", + "control_id": "resume-1", + "resumes_control_id": "pause-1", + }, + { + "state": "paused", + "control_id": "pause-2", + "main_sha": newer, + }, + ] + client.base_sha.side_effect = [sha, newer] + client.set_pipeline_control.return_value = "resume-1" + + with self.assertRaisesRegex(QueueError, "pipeline remains paused"): + command_unpause(client, main_sha=sha, control_id="pause-1") + + client.set_pipeline_control.assert_called_once_with( + "running", None, resumes_control_id="pause-1" + ) + + def test_pipeline_control_ignores_stale_resume_after_new_pause(self) -> None: + sha = "a" * 40 + client = object.__new__(GitHub) + client.coordinator_logins = {"coordinator"} + client.registry_comments = Mock( + return_value=[ + { + "id": 1, + "created_at": "2026-06-21T17:17:13Z", + "user": {"login": "coordinator"}, + "body": control_body( + state="paused", + control_id="pause-1", + reason=f"ci-failed on {sha}", + main_sha=sha, + ), + }, + { + "id": 2, + "created_at": "2026-06-21T17:17:14Z", + "user": {"login": "coordinator"}, + "body": control_body( + state="paused", + control_id="pause-2", + reason=f"deploy-failed on {sha}", + main_sha=sha, + ), + }, + { + "id": 3, + "created_at": "2026-06-21T17:17:15Z", + "user": {"login": "coordinator"}, + "body": control_body( + state="running", + control_id="resume-1", + resumes_control_id="pause-1", + ), + }, + { + "id": 4, + "created_at": "2026-06-21T17:17:16Z", + "user": {"login": "coordinator"}, + "body": ( + '\n' + "Recorded DeployBot pipeline control." 
+ ), + }, + ] + ) + + control = client.pipeline_control() + + self.assertEqual(control["state"], "paused") + self.assertEqual(control["control_id"], "pause-2") + + def test_pipeline_control_ignores_stale_conditional_repause(self) -> None: + sha = "a" * 40 + client = object.__new__(GitHub) + client.coordinator_logins = {"coordinator"} + records = [ + control_body( + state="paused", + control_id="pause-1", + reason=f"ci-failed on {sha}", + main_sha=sha, + ), + control_body( + state="running", + control_id="resume-1", + resumes_control_id="pause-1", + ), + control_body( + state="paused", + control_id="pause-2", + reason=f"deploy-failed on {sha}", + main_sha=sha, + ), + control_body( + state="paused", + control_id="compensation-1", + reason=f"main advanced from {sha}", + main_sha=sha, + requires_control_id="resume-1", + ), + ] + client.registry_comments = Mock( + return_value=[ + { + "id": index, + "created_at": f"2026-06-21T17:17:{index:02d}Z", + "user": {"login": "coordinator"}, + "body": body, + } + for index, body in enumerate(records, start=1) + ] + ) + + control = client.pipeline_control() + + self.assertEqual(control["state"], "paused") + self.assertEqual(control["control_id"], "pause-2") + + def test_pipeline_control_migrates_legacy_pause_with_comment_identity(self) -> None: + sha = "a" * 40 + client = object.__new__(GitHub) + client.coordinator_logins = {"coordinator"} + client.base_sha = Mock(return_value=sha) + client.registry_comments = Mock( + return_value=[ + { + "id": 42, + "created_at": "2026-06-21T17:17:13Z", + "user": {"login": "coordinator"}, + "body": ( + '\n' + "Recorded DeployBot pipeline control." 
+ ), + } + ] + ) + + control = client.pipeline_control() + + self.assertEqual(control["control_id"], "legacy-comment:42") + self.assertEqual(control["main_sha"], sha) + self.assertTrue(control["legacy_control"]) + def test_github_dispatches_each_configured_active_ci_workflow(self) -> None: client = object.__new__(GitHub) client.config = CONFIG diff --git a/tests/test_skill.py b/tests/test_skill.py index 5e4e320..aa83835 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "01c8c6e48c3a92155803cd4232b56b0c1d3363c2" +RELEASE_COMMIT = "992048a90e1db410b9197fe056c591d91c1b2019" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0" @@ -86,6 +86,45 @@ def test_status_guidance_is_read_only(self) -> None: self.assertIn("Never publish prompts, transcripts", skill) self.assertIn("Never call `freeze_queue` merely to view status", skill) self.assertIn("exact `deploy` instruction", skill) + self.assertIn("Immediately before telling the user", skill) + self.assertIn("do not repeat a stale action request", skill) + self.assertIn("original `deploy` instruction already authorizes", skill) + + def test_every_adapter_revalidates_before_unpause_handoff(self) -> None: + paths = [ + ROOT / "skills" / "manage-merge-queue" / "SKILL.md", + ROOT / "adapters" / "claude-code" / "skills" / "deploybot" / "SKILL.md", + ROOT + / "adapters" + / "claude-code" + / "skills" + / "manage-merge-queue" + / "SKILL.md", + ROOT + / "adapters" + / "codex" + / "agent-merge-queue" + / "skills" + / "deploybot" + / "SKILL.md", + ROOT + / "adapters" + / "codex" + / "agent-merge-queue" + / "skills" + / "manage-merge-queue" + / "SKILL.md", + ROOT / "adapters" / "cursor" / ".cursor" / "rules" / "deploybot.mdc", + ] + for path in paths: + text = path.read_text(encoding="utf-8") + with self.subTest(path=path): + if "adapters/codex" in path.as_posix(): + 
self.assertIn("deploybot status --json", text) + else: + self.assertIn("pipeline_status", text) + self.assertIn("original", text.lower()) + self.assertIn("unpause", text) def test_cursor_adapter_exposes_status_workflow(self) -> None: rule = (