From 798c60e7936093ee783dea5605a90518266430b9 Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:52:54 -0700 Subject: [PATCH 01/11] Prevent stale unpause handoffs --- README.md | 12 +- .../claude-code/.claude-plugin/plugin.json | 2 +- .../claude-code/skills/deploybot/SKILL.md | 15 ++ .../skills/manage-merge-queue/SKILL.md | 9 + .../.codex-plugin/plugin.json | 2 +- .../skills/deploybot/SKILL.md | 15 ++ .../skills/manage-merge-queue/SKILL.md | 10 + adapters/cursor/.cursor/rules/deploybot.mdc | 8 + docs/reference.md | 4 +- examples/github-workflow.yml | 2 +- pyproject.toml | 2 +- skills/deploybot/SKILL.md | 15 ++ skills/manage-merge-queue/SKILL.md | 9 + src/agent_merge_queue/__init__.py | 2 +- src/agent_merge_queue/cli.py | 80 ++++++-- src/agent_merge_queue/records.py | 71 ++++++- tests/test_cli.py | 178 ++++++++++++++++++ tests/test_skill.py | 39 ++++ 18 files changed, 453 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index e96769e..aa4a93e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ integration PRs, follows `main` through production, and pauses after failures. ## Install -Install the reviewed `v0.2.12` source commit directly from GitHub: +Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ @@ -190,6 +190,14 @@ deploybot resume and emits a new wake-up event. `follow` tracks newer cumulative `main` revisions until exact CI, deployment, and optional HTTP checks pass. A CI or deploy failure can pause further merges until `deploybot unpause`. +Before presenting an unpause request, adapters must refresh `deploybot status +--json` and suppress stale prompts when the durable controller is already +running or the release advanced. The original deploy instruction authorizes the +coordinator to unpause the matching failed release after its elected repair +head passes fresh checks and review. 
Pass that status result's failed main SHA +and unique `control_id` to `deploybot unpause --sha SHA --control-id ID` so a +concurrent newer pause remains authoritative. Rollback, +bypass, and mismatched recovery still require explicit user direction. Before starting an exact-main recovery, an agent runs `deploybot claim-release-repair --provider CLIENT --thread-id ID`. A @@ -323,7 +331,7 @@ deploybot integrate [--all] deploybot follow [--timeout SECONDS] [--poll SECONDS] [--json] deploybot metrics --json deploybot pause --reason "main CI failed" -deploybot unpause +deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID deploybot block [PR] --reason "..." deploybot unblock [PR] deploybot resume [PR] diff --git a/adapters/claude-code/.claude-plugin/plugin.json b/adapters/claude-code/.claude-plugin/plugin.json index 810ed19..61faa61 100644 --- a/adapters/claude-code/.claude-plugin/plugin.json +++ b/adapters/claude-code/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "deploybot", - "version": "0.2.12", + "version": "0.2.13", "description": "DeployBot: a provider-neutral GitHub merge queue for coding agents", "author": { "name": "DeployBot contributors" diff --git a/adapters/claude-code/skills/deploybot/SKILL.md b/adapters/claude-code/skills/deploybot/SKILL.md index cf88e38..f79f2f6 100644 --- a/adapters/claude-code/skills/deploybot/SKILL.md +++ b/adapters/claude-code/skills/deploybot/SKILL.md @@ -79,6 +79,21 @@ Use `diagnose`/`deploybot doctor` for setup drift and `delivery_metrics` for p50 p95, and slow-stage evidence. A failed cumulative CI or deployment pauses the controller; only a designated coordinator may unpause after recovery. +Immediately before telling the user that the pipeline is paused or asking them +to `unpause`, re-read `pipeline_status` or run `deploybot status --json`. Treat +that fresh durable state as authoritative. 
If the controller is already +running or the release has advanced, do not repeat a stale action request; +continue coordinating or report the current gate. + +The original `deploy` instruction already authorizes a designated coordinator +to run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` +for the matching failed release when the elected repair head has fresh +required checks and review, the pause reason still names that release, and no +rollback or gate waiver is involved. Revalidate status, unpause, then continue +the merge and release without asking for another user message. Ask the user +only when recovery is unresolved, ownership or SHA does not match, or the next +step requires a rollback, bypass, or expanded authority. + Before opening or editing an exact-main recovery PR, call `claim_release_repair` with the native provider and thread ID. Work only when it returns `owned`, using its deterministic branch. If it returns `claimed`, the diff --git a/adapters/claude-code/skills/manage-merge-queue/SKILL.md b/adapters/claude-code/skills/manage-merge-queue/SKILL.md index 8d92cb7..e1e2c4f 100644 --- a/adapters/claude-code/skills/manage-merge-queue/SKILL.md +++ b/adapters/claude-code/skills/manage-merge-queue/SKILL.md @@ -29,6 +29,15 @@ returned `owned` thread may use the deterministic repair branch. Respect the maximum batch size and keep new merges closed while an earlier release is unfinished. +Immediately before asking the user to `unpause` or take another repair action, +call `pipeline_status` again. Never show a stale pause prompt when durable state +is already `running` or the release has advanced. The original `deploy` +instruction authorizes the coordinator to unpause the matching failed release +after the elected repair head passes fresh checks and review, provided the pause +reason still matches and no rollback or gate waiver is needed. In that case, +run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` and +continue without asking the user to repeat authorization. 
+ When `follow_release` returns `thread_notifications`, send each supplied message to its native source thread. The source thread calls `acknowledge_thread_deployment` with the matching `notification_id`. Present the diff --git a/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json b/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json index 1dcbe9f..9bc4a48 100644 --- a/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json +++ b/adapters/codex/agent-merge-queue/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "deploybot", - "version": "0.2.12", + "version": "0.2.13", "description": "Coordinate exact-head pull requests through verified deployment and thread notification", "author": { "name": "DeployBot contributors" diff --git a/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md b/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md index 3d9f285..0d81d66 100644 --- a/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md +++ b/adapters/codex/agent-merge-queue/skills/deploybot/SKILL.md @@ -76,6 +76,21 @@ Use `deploybot doctor --json` for setup drift and `deploybot metrics --json` for p50, p95, and slow-stage evidence. A failed cumulative CI or deployment pauses the controller; only a designated coordinator may unpause after recovery. +Immediately before telling the user that the pipeline is paused or asking them +to `unpause`, run `deploybot status --json` again. Treat that fresh durable +state as authoritative. If the controller is already running or the release +has advanced, do not repeat a stale action request; continue coordinating or +report the current gate. + +The original `deploy` instruction already authorizes a designated coordinator +to run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` +for the matching failed release when the elected +repair head has fresh required checks and review, the pause reason still names +that release, and no rollback or gate waiver is involved. 
Revalidate status, +unpause, then continue the merge and release without asking for another user +message. Ask the user only when recovery is unresolved, ownership or SHA does +not match, or the next step requires a rollback, bypass, or expanded authority. + Before opening or editing an exact-main recovery PR, run `deploybot claim-release-repair` with the native provider and thread ID. Work only when it returns `owned`, using its deterministic branch. If it returns `claimed`, the diff --git a/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md b/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md index a8d0937..aba3a34 100644 --- a/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md +++ b/adapters/codex/agent-merge-queue/skills/manage-merge-queue/SKILL.md @@ -30,6 +30,16 @@ only the returned `owned` thread may use the deterministic repair branch. Respec maximum batch size and keep new merges closed while an earlier release is unfinished. +Immediately before asking the user to `unpause` or take another repair action, +run `deploybot status --json` again. Never show a stale pause prompt when +durable state is already `running` or the release has advanced. The original +`deploy` instruction authorizes the coordinator to run `deploybot unpause +--sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` for +the matching failed release after the elected repair head passes fresh checks +and review, provided the pause reason still matches and no rollback or gate +waiver is needed. In that case, unpause and continue without asking the user to +repeat authorization. + When `deploybot follow --json` returns `thread_notifications`, send each supplied message to its native source thread. 
In Codex use `send_message_to_thread`; the source thread runs `deploybot thread acknowledge` with the matching diff --git a/adapters/cursor/.cursor/rules/deploybot.mdc b/adapters/cursor/.cursor/rules/deploybot.mdc index fdeec95..4039a71 100644 --- a/adapters/cursor/.cursor/rules/deploybot.mdc +++ b/adapters/cursor/.cursor/rules/deploybot.mdc @@ -15,6 +15,14 @@ the stable Cursor thread ID, never prompts or transcripts. Refresh intent only after replacement-head review. Only the coordinator may react, integrate, drain, follow, pause, or resume repaired work. +Immediately before asking the user to unpause or take repair action, call +`pipeline_status` again and suppress the request if durable state is already +running or the release advanced. The original deploy instruction authorizes the +coordinator to unpause the matching failed release after its elected repair head +passes fresh checks and review, unless recovery requires a rollback, gate +waiver, or different authority. Use the exact failed main SHA and the refreshed +unique `control_id` so a newer pause remains authoritative. + After exact-main verification, deliver each returned `thread_notifications` message into its native source thread, then call `acknowledge_thread_deployment` with the matching `notification_id`. Leave a diff --git a/docs/reference.md b/docs/reference.md index a1f3347..5a17b07 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,7 +1,7 @@ # DeployBot reference This reference describes the CLI, MCP server, policy file, and GitHub Action in -DeployBot v0.2.12. GitHub labels and authenticated comments are the durable state; +DeployBot v0.2.13. GitHub labels and authenticated comments are the durable state; the CLI and MCP tools are two interfaces to the same operations. ## CLI @@ -58,7 +58,7 @@ has fresh evidence; the user does not need to repeat the instruction. 
| `deploybot integrate [--all]` | Scaffold a cumulative integration PR for overlap groups, or the whole frozen batch with `--all`. | | `deploybot follow [--timeout SECONDS] [--poll SECONDS] [--json]` | Follow the newest exact base-branch head through CI, deployment, and HTTP verification. Defaults: 1800-second timeout and 10-second poll. | | `deploybot pause --reason TEXT` | Pause merging after a delivery failure. | -| `deploybot unpause` | Resume a pipeline after verified recovery. | +| `deploybot unpause --sha SHA --control-id ID` | Conditionally resume the matching failed release after fresh status revalidation and verified repair; a running record can clear only that unique pause, so changed control or advanced main fails closed. The original deploy instruction remains sufficient unless rollback, bypass, or mismatched recovery expands authority. | | `deploybot claim-release-repair --provider CLIENT --thread-id ID [--thread-url URL] [--sha SHA]` | Atomically claim the owner-encoded deterministic repair branch for the current failed exact-main release. Other threads recover the same owner from the ref instead of creating duplicate repair PRs. | Only a configured coordinator should run these operations. `react diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index 7d5f2d8..8d677d3 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -59,5 +59,5 @@ jobs: with: ref: ${{ github.event.repository.default_branch }} persist-credentials: false - # v0.2.12 implementation; keep the full commit for privileged workflows. + # v0.2.13 implementation; keep the full commit for privileged workflows. 
- uses: Forward-Future/DeployBot@01c8c6e48c3a92155803cd4232b56b0c1d3363c2 diff --git a/pyproject.toml b/pyproject.toml index ce2b12d..d0b5838 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deploybot-merge-queue" -version = "0.2.12" +version = "0.2.13" description = "DeployBot: a provider-neutral GitHub merge queue for coding agents" readme = "README.md" license = "MIT" diff --git a/skills/deploybot/SKILL.md b/skills/deploybot/SKILL.md index cf88e38..f79f2f6 100644 --- a/skills/deploybot/SKILL.md +++ b/skills/deploybot/SKILL.md @@ -79,6 +79,21 @@ Use `diagnose`/`deploybot doctor` for setup drift and `delivery_metrics` for p50 p95, and slow-stage evidence. A failed cumulative CI or deployment pauses the controller; only a designated coordinator may unpause after recovery. +Immediately before telling the user that the pipeline is paused or asking them +to `unpause`, re-read `pipeline_status` or run `deploybot status --json`. Treat +that fresh durable state as authoritative. If the controller is already +running or the release has advanced, do not repeat a stale action request; +continue coordinating or report the current gate. + +The original `deploy` instruction already authorizes a designated coordinator +to run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` +for the matching failed release when the elected repair head has fresh +required checks and review, the pause reason still names that release, and no +rollback or gate waiver is involved. Revalidate status, unpause, then continue +the merge and release without asking for another user message. Ask the user +only when recovery is unresolved, ownership or SHA does not match, or the next +step requires a rollback, bypass, or expanded authority. + Before opening or editing an exact-main recovery PR, call `claim_release_repair` with the native provider and thread ID. Work only when it returns `owned`, using its deterministic branch. 
If it returns `claimed`, the diff --git a/skills/manage-merge-queue/SKILL.md b/skills/manage-merge-queue/SKILL.md index c8cdf78..dd3dc91 100644 --- a/skills/manage-merge-queue/SKILL.md +++ b/skills/manage-merge-queue/SKILL.md @@ -56,3 +56,12 @@ new merges closed while an earlier exact-main release is unfinished. Record exact heads, review verdicts, merged commits, waiting items, repair packets, integration groups, and delivery timing. + +Immediately before asking the user to `unpause` or take another repair action, +call `pipeline_status` again. Never show a stale pause prompt when durable state +is already `running` or the release has advanced. The original `deploy` +instruction authorizes the coordinator to unpause the matching failed release +after the elected repair head passes fresh checks and review, provided the pause +reason still matches and no rollback or gate waiver is needed. In that case, +run `deploybot unpause --sha FAILED_MAIN_SHA --control-id PAUSE_CONTROL_ID` and +continue without asking the user to repeat authorization. 
diff --git a/src/agent_merge_queue/__init__.py b/src/agent_merge_queue/__init__.py index 82cb24c..bf36d52 100644 --- a/src/agent_merge_queue/__init__.py +++ b/src/agent_merge_queue/__init__.py @@ -1,3 +1,3 @@ """DeployBot: a provider-neutral GitHub merge queue for coding agents.""" -__version__ = "0.2.12" +__version__ = "0.2.13" diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index d3a871a..a3bb5c2 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -8,6 +8,7 @@ import hashlib import json import re +import secrets import shutil import subprocess import sys @@ -31,7 +32,6 @@ summarize_metrics, ) from .records import ( - CONTROL_MARKER, INTEGRATION_MARKER, REPAIR_MARKER, RELEASE_WATERMARK_MARKER, @@ -43,6 +43,7 @@ integration_body, intent_body, latest_intent, + latest_control, latest_release_repair, latest_deployment_notifications, latest_payload, @@ -1393,18 +1394,40 @@ def thread_records(self, *, include_terminal: bool = False) -> list[dict[str, An ] def pipeline_control(self) -> dict[str, Any]: - value = latest_payload( - self.registry_comments(), - CONTROL_MARKER, - self.coordinator_logins, - ) - return value or {"state": "running"} + control = latest_control(self.registry_comments(), self.coordinator_logins) + if ( + control.get("state") == "paused" + and control.get("legacy_control") + and not control.get("main_sha") + ): + # v0.2.12 pause records predate release binding. The immutable + # comment ID still supplies a unique compare-and-set token; bind + # the migration view to the current main and recheck it at write. 
+ return {**control, "main_sha": self.base_sha()} + return control - def set_pipeline_control(self, state: str, reason: str | None = None) -> None: + def set_pipeline_control( + self, + state: str, + reason: str | None = None, + *, + resumes_control_id: str | None = None, + ) -> str: number = self.registry_issue_number(create=True) if number is None: # pragma: no cover raise QueueError("could not create DeployBot registry") - self.issue_comment(number, control_body(state=state, reason=reason)) + control_id = secrets.token_hex(16) + self.issue_comment( + number, + control_body( + state=state, + control_id=control_id, + reason=reason, + main_sha=self.base_sha() if state == "paused" else None, + resumes_control_id=resumes_control_id, + ), + ) + return control_id def verified_main_sha(self) -> str | None: value = latest_payload( @@ -5080,6 +5103,33 @@ def command_control(client: GitHub, *, state: str, reason: str | None) -> None: print(f"DeployBot pipeline is {state}") +def command_unpause( + client: GitHub, + *, + main_sha: str, + control_id: str, +) -> None: + control = client.pipeline_control() + if control.get("state") != "paused": + raise QueueError("DeployBot pipeline is no longer paused; refresh status") + if str(control.get("control_id") or "") != control_id: + raise QueueError("DeployBot pause record changed; refresh status") + if str(control.get("main_sha") or "") != main_sha: + raise QueueError("DeployBot pause belongs to a different main; refresh status") + current_sha = client.base_sha() + if current_sha != main_sha: + raise QueueError( + f"DeployBot main advanced from {main_sha} to {current_sha}; refresh status" + ) + client.set_pipeline_control("running", None, resumes_control_id=control_id) + refreshed = client.pipeline_control() + if refreshed.get("state") != "running" or ( + refreshed.get("resumes_control_id") != control_id + ): + raise QueueError("DeployBot pause changed during unpause; refresh status") + print(f"DeployBot pipeline is running for 
recovered main {main_sha}") + + def command_claim_release_repair( client: GitHub, *, @@ -5261,7 +5311,11 @@ def build_parser() -> argparse.ArgumentParser: "pause", help="pause merging after a delivery failure" ) pause.add_argument("--reason", required=True) - subparsers.add_parser("unpause", help="resume a paused delivery pipeline") + unpause = subparsers.add_parser( + "unpause", help="resume the exact revalidated failed release" + ) + unpause.add_argument("--sha", required=True, dest="main_sha") + unpause.add_argument("--control-id", required=True) claim_repair = subparsers.add_parser( "claim-release-repair", help="atomically claim ownership of the current failed release", @@ -5387,7 +5441,11 @@ def main(argv: list[str] | None = None) -> int: elif arguments.command == "pause": command_control(client, state="paused", reason=arguments.reason) elif arguments.command == "unpause": - command_control(client, state="running", reason=None) + command_unpause( + client, + main_sha=arguments.main_sha, + control_id=arguments.control_id, + ) elif arguments.command == "claim-release-repair": command_claim_release_repair( client, diff --git a/src/agent_merge_queue/records.py b/src/agent_merge_queue/records.py index 38a0b85..6c682f0 100644 --- a/src/agent_merge_queue/records.py +++ b/src/agent_merge_queue/records.py @@ -120,6 +120,57 @@ def latest_payload( return max(found, key=lambda item: item[0])[1] if found else None +def latest_control( + comments: Iterable[dict[str, Any]], trusted_logins: Iterable[str] +) -> dict[str, Any]: + """Resolve pause/resume records without letting a stale resume clear a new pause.""" + trusted = {value.lower() for value in trusted_logins} + found: list[tuple[tuple[str, int, int], dict[str, Any]]] = [] + for index, comment in enumerate(comments): + if comment_login(comment) not in trusted: + continue + value = _payload(str(comment.get("body") or ""), CONTROL_MARKER) + if value is not None: + if value.get("state") == "paused" and not 
value.get("control_id"): + comment_id = comment.get("id") + legacy_id = ( + f"legacy-comment:{comment_id}" + if comment_id is not None + else f"legacy-record:{_comment_key(comment, index)}" + ) + value = { + **value, + "control_id": legacy_id, + "legacy_control": True, + } + found.append((_comment_key(comment, index), value)) + + state: dict[str, Any] = {"state": "running"} + for _, value in sorted(found, key=lambda item: item[0]): + if value.get("state") == "paused": + state = value + continue + if value.get("state") != "running": + continue + resumed_control_id = str(value.get("resumes_control_id") or "") + if not resumed_control_id: + # Backward-compatible unconditional running records from v0.2.12. + # They may clear only legacy pauses; a rolling-upgrade client must + # never override a modern pause that requires a matching token. + if not ( + state.get("state") == "paused" + and state.get("control_id") + and not state.get("legacy_control") + ): + state = value + elif ( + state.get("state") == "paused" + and state.get("control_id") == resumed_control_id + ): + state = value + return state + + @dataclass(frozen=True) class ThreadRecord: provider: str @@ -474,12 +525,28 @@ def latest_release_repair( return max(found, key=lambda item: item[0])[1] if found else None -def control_body(*, state: str, reason: str | None = None) -> str: +def control_body( + *, + state: str, + control_id: str, + reason: str | None = None, + main_sha: str | None = None, + resumes_control_id: str | None = None, +) -> str: if state not in {"running", "paused"}: raise ValueError(f"unsupported pipeline control state: {state}") - payload = {"recorded_at": utc_now(), "schema": 1, "state": state} + payload = { + "control_id": control_id, + "recorded_at": utc_now(), + "schema": 1, + "state": state, + } if reason: payload["reason"] = reason + if main_sha: + payload["main_sha"] = main_sha + if resumes_control_id: + payload["resumes_control_id"] = resumes_control_id return 
marker_body(CONTROL_PREFIX, payload, "Recorded DeployBot pipeline control.") diff --git a/tests/test_cli.py b/tests/test_cli.py index 24f9cf0..39d1d88 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,6 +32,7 @@ command_resume, command_thread_acknowledge, command_unblock, + command_unpause, completed_batch_ids, delivery_metrics, deployment_repair_required, @@ -65,6 +66,7 @@ ) from agent_merge_queue.config import parse_config from agent_merge_queue.records import ( + control_body, integration_body, intent_body, release_repair_body, @@ -4206,6 +4208,182 @@ def test_reactor_pauses_when_post_merge_ci_dispatch_fails(self) -> None: "paused", "post-merge CI dispatch failed: CI has no dispatch" ) + def test_unpause_compare_and_sets_matching_failed_release(self) -> None: + sha = "a" * 40 + control_id = "pause-1" + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "reason": f"ci-failed on {sha}", + "control_id": control_id, + "main_sha": sha, + }, + { + "state": "running", + "resumes_control_id": control_id, + }, + ] + client.base_sha.return_value = sha + + with redirect_stdout(io.StringIO()): + command_unpause( + client, + main_sha=sha, + control_id=control_id, + ) + + client.set_pipeline_control.assert_called_once_with( + "running", None, resumes_control_id=control_id + ) + + def test_unpause_rejects_changed_pause_record(self) -> None: + sha = "a" * 40 + client = Mock() + client.pipeline_control.return_value = { + "state": "paused", + "reason": f"ci-failed on {sha}", + "control_id": "newer", + "main_sha": sha, + } + + with self.assertRaisesRegex(QueueError, "pause record changed"): + command_unpause( + client, + main_sha=sha, + control_id="older", + ) + + client.set_pipeline_control.assert_not_called() + + def test_unpause_rejects_advanced_main(self) -> None: + sha = "a" * 40 + client = Mock() + client.pipeline_control.return_value = { + "state": "paused", + "reason": f"ci-failed on {sha}", + "control_id": "same", + 
"main_sha": sha, + } + client.base_sha.return_value = "b" * 40 + + with self.assertRaisesRegex(QueueError, "main advanced"): + command_unpause( + client, + main_sha=sha, + control_id="same", + ) + + client.set_pipeline_control.assert_not_called() + + def test_unpause_rejects_new_pause_won_during_transition(self) -> None: + sha = "a" * 40 + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "control_id": "pause-1", + "main_sha": sha, + }, + { + "state": "paused", + "control_id": "pause-2", + "main_sha": sha, + }, + ] + client.base_sha.return_value = sha + + with self.assertRaisesRegex(QueueError, "changed during unpause"): + command_unpause(client, main_sha=sha, control_id="pause-1") + + client.set_pipeline_control.assert_called_once_with( + "running", None, resumes_control_id="pause-1" + ) + + def test_pipeline_control_ignores_stale_resume_after_new_pause(self) -> None: + sha = "a" * 40 + client = object.__new__(GitHub) + client.coordinator_logins = {"coordinator"} + client.registry_comments = Mock( + return_value=[ + { + "id": 1, + "created_at": "2026-06-21T17:17:13Z", + "user": {"login": "coordinator"}, + "body": control_body( + state="paused", + control_id="pause-1", + reason=f"ci-failed on {sha}", + main_sha=sha, + ), + }, + { + "id": 2, + "created_at": "2026-06-21T17:17:14Z", + "user": {"login": "coordinator"}, + "body": control_body( + state="paused", + control_id="pause-2", + reason=f"deploy-failed on {sha}", + main_sha=sha, + ), + }, + { + "id": 3, + "created_at": "2026-06-21T17:17:15Z", + "user": {"login": "coordinator"}, + "body": control_body( + state="running", + control_id="resume-1", + resumes_control_id="pause-1", + ), + }, + { + "id": 4, + "created_at": "2026-06-21T17:17:16Z", + "user": {"login": "coordinator"}, + "body": ( + '\n' + "Recorded DeployBot pipeline control." 
+ ), + }, + ] + ) + + control = client.pipeline_control() + + self.assertEqual(control["state"], "paused") + self.assertEqual(control["control_id"], "pause-2") + + def test_pipeline_control_migrates_legacy_pause_with_comment_identity(self) -> None: + sha = "a" * 40 + client = object.__new__(GitHub) + client.coordinator_logins = {"coordinator"} + client.base_sha = Mock(return_value=sha) + client.registry_comments = Mock( + return_value=[ + { + "id": 42, + "created_at": "2026-06-21T17:17:13Z", + "user": {"login": "coordinator"}, + "body": ( + '\n' + "Recorded DeployBot pipeline control." + ), + } + ] + ) + + control = client.pipeline_control() + + self.assertEqual(control["control_id"], "legacy-comment:42") + self.assertEqual(control["main_sha"], sha) + self.assertTrue(control["legacy_control"]) + def test_github_dispatches_each_configured_active_ci_workflow(self) -> None: client = object.__new__(GitHub) client.config = CONFIG diff --git a/tests/test_skill.py b/tests/test_skill.py index 5e4e320..9b35b2a 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -86,6 +86,45 @@ def test_status_guidance_is_read_only(self) -> None: self.assertIn("Never publish prompts, transcripts", skill) self.assertIn("Never call `freeze_queue` merely to view status", skill) self.assertIn("exact `deploy` instruction", skill) + self.assertIn("Immediately before telling the user", skill) + self.assertIn("do not repeat a stale action request", skill) + self.assertIn("original `deploy` instruction already authorizes", skill) + + def test_every_adapter_revalidates_before_unpause_handoff(self) -> None: + paths = [ + ROOT / "skills" / "manage-merge-queue" / "SKILL.md", + ROOT / "adapters" / "claude-code" / "skills" / "deploybot" / "SKILL.md", + ROOT + / "adapters" + / "claude-code" + / "skills" + / "manage-merge-queue" + / "SKILL.md", + ROOT + / "adapters" + / "codex" + / "agent-merge-queue" + / "skills" + / "deploybot" + / "SKILL.md", + ROOT + / "adapters" + / "codex" + / 
"agent-merge-queue" + / "skills" + / "manage-merge-queue" + / "SKILL.md", + ROOT / "adapters" / "cursor" / ".cursor" / "rules" / "deploybot.mdc", + ] + for path in paths: + text = path.read_text(encoding="utf-8") + with self.subTest(path=path): + if "adapters/codex" in path.as_posix(): + self.assertIn("deploybot status --json", text) + else: + self.assertIn("pipeline_status", text) + self.assertIn("original", text.lower()) + self.assertIn("unpause", text) def test_cursor_adapter_exposes_status_workflow(self) -> None: rule = ( From d0420c5ca145d44ca87acb684875eb6d485a9c76 Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:53:15 -0700 Subject: [PATCH 02/11] Pin DeployBot v0.2.13 runtime --- README.md | 4 ++-- adapters/claude-code/.mcp.json | 2 +- adapters/cursor/.cursor/mcp.json | 2 +- examples/github-workflow.yml | 2 +- tests/test_skill.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index aa4a93e..164f0f5 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@01c8c6e48c3a92155803cd4232b56b0c1d3363c2' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@798c60e7936093ee783dea5605a90518266430b9' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@01c8c6e48c3a92155803cd4232b56b0c1d3363c2 +- uses: Forward-Future/DeployBot@798c60e7936093ee783dea5605a90518266430b9 ``` The Action uses GitHub's built-in workflow token. 
GitHub intentionally does not diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index 49ebaf9..21bc382 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@01c8c6e48c3a92155803cd4232b56b0c1d3363c2", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@798c60e7936093ee783dea5605a90518266430b9", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index 49ebaf9..21bc382 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@01c8c6e48c3a92155803cd4232b56b0c1d3363c2", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@798c60e7936093ee783dea5605a90518266430b9", "deploybot-mcp" ] } diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index 8d677d3..bf09cbf 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -60,4 +60,4 @@ jobs: ref: ${{ github.event.repository.default_branch }} persist-credentials: false # v0.2.13 implementation; keep the full commit for privileged workflows. 
- - uses: Forward-Future/DeployBot@01c8c6e48c3a92155803cd4232b56b0c1d3363c2 + - uses: Forward-Future/DeployBot@798c60e7936093ee783dea5605a90518266430b9 diff --git a/tests/test_skill.py b/tests/test_skill.py index 9b35b2a..0558b23 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "01c8c6e48c3a92155803cd4232b56b0c1d3363c2" +RELEASE_COMMIT = "798c60e7936093ee783dea5605a90518266430b9" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0" From 8aae016e1c52d9084d778e59d9cad7476c4ce96c Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:55:11 -0700 Subject: [PATCH 03/11] Bind pause control to observed release heads --- src/agent_merge_queue/cli.py | 15 +++++++++- tests/test_cli.py | 58 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index a3bb5c2..a4550d8 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -1411,6 +1411,7 @@ def set_pipeline_control( state: str, reason: str | None = None, *, + main_sha: str | None = None, resumes_control_id: str | None = None, ) -> str: number = self.registry_issue_number(create=True) @@ -1423,7 +1424,7 @@ def set_pipeline_control( state=state, control_id=control_id, reason=reason, - main_sha=self.base_sha() if state == "paused" else None, + main_sha=(main_sha or self.base_sha()) if state == "paused" else None, resumes_control_id=resumes_control_id, ), ) @@ -4464,6 +4465,7 @@ def command_follow( client.set_pipeline_control( "paused", f"{result['state']} on {result['main_sha']}", + main_sha=str(result["main_sha"]), ) if result["state"] == "verified": client.record_verified_main(str(result["main_sha"])) @@ -5127,6 +5129,17 @@ def command_unpause( refreshed.get("resumes_control_id") != control_id 
): raise QueueError("DeployBot pause changed during unpause; refresh status") + refreshed_main = client.base_sha() + if refreshed_main != main_sha: + client.set_pipeline_control( + "paused", + f"main advanced during unpause from {main_sha} to {refreshed_main}", + main_sha=refreshed_main, + ) + raise QueueError( + f"DeployBot main advanced from {main_sha} to {refreshed_main} " + "during unpause; pipeline remains paused" + ) print(f"DeployBot pipeline is running for recovered main {main_sha}") diff --git a/tests/test_cli.py b/tests/test_cli.py index 39d1d88..79cc3f1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -140,6 +140,32 @@ def deployment_notification( class QueueCoreTest(unittest.TestCase): + def test_follow_binds_pause_to_observed_failed_main(self) -> None: + sha = "a" * 40 + client = Mock() + client.config = CONFIG + release = { + "state": "ci-failed", + "main_sha": sha, + "latest_ci": {}, + "latest_deploy": None, + "verifications": [], + } + + with patch("agent_merge_queue.cli.follow_release", return_value=release): + result = command_follow( + client, + timeout_seconds=10, + poll_seconds=1, + json_output=True, + emit=False, + ) + + self.assertEqual(result, release) + client.set_pipeline_control.assert_called_once_with( + "paused", f"ci-failed on {sha}", main_sha=sha + ) + def test_pull_release_details_reads_human_facing_metadata(self) -> None: client = object.__new__(GitHub) client.repository = "example/repo" @@ -4300,6 +4326,38 @@ def test_unpause_rejects_new_pause_won_during_transition(self) -> None: "running", None, resumes_control_id="pause-1" ) + def test_unpause_repauses_when_main_advances_during_transition(self) -> None: + sha = "a" * 40 + newer = "b" * 40 + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "control_id": "pause-1", + "main_sha": sha, + }, + { + "state": "running", + "resumes_control_id": "pause-1", + }, + ] + client.base_sha.side_effect = [sha, newer] + + with 
self.assertRaisesRegex(QueueError, "pipeline remains paused"): + command_unpause(client, main_sha=sha, control_id="pause-1") + + self.assertEqual( + client.set_pipeline_control.call_args_list, + [ + call("running", None, resumes_control_id="pause-1"), + call( + "paused", + f"main advanced during unpause from {sha} to {newer}", + main_sha=newer, + ), + ], + ) + def test_pipeline_control_ignores_stale_resume_after_new_pause(self) -> None: sha = "a" * 40 client = object.__new__(GitHub) From 1eedf451fc9e71f33814e295bfc6cdd1ac5019ea Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:55:23 -0700 Subject: [PATCH 04/11] Repin DeployBot v0.2.13 after control hardening --- README.md | 4 ++-- adapters/claude-code/.mcp.json | 2 +- adapters/cursor/.cursor/mcp.json | 2 +- examples/github-workflow.yml | 2 +- tests/test_skill.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 164f0f5..d6a4035 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@798c60e7936093ee783dea5605a90518266430b9' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@8aae016e1c52d9084d778e59d9cad7476c4ce96c' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@798c60e7936093ee783dea5605a90518266430b9 +- uses: Forward-Future/DeployBot@8aae016e1c52d9084d778e59d9cad7476c4ce96c ``` The Action uses GitHub's built-in workflow token. 
GitHub intentionally does not diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index 21bc382..50897dd 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@798c60e7936093ee783dea5605a90518266430b9", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@8aae016e1c52d9084d778e59d9cad7476c4ce96c", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index 21bc382..50897dd 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@798c60e7936093ee783dea5605a90518266430b9", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@8aae016e1c52d9084d778e59d9cad7476c4ce96c", "deploybot-mcp" ] } diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index bf09cbf..c6f3cc0 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -60,4 +60,4 @@ jobs: ref: ${{ github.event.repository.default_branch }} persist-credentials: false # v0.2.13 implementation; keep the full commit for privileged workflows. 
- - uses: Forward-Future/DeployBot@798c60e7936093ee783dea5605a90518266430b9 + - uses: Forward-Future/DeployBot@8aae016e1c52d9084d778e59d9cad7476c4ce96c diff --git a/tests/test_skill.py b/tests/test_skill.py index 0558b23..cff241b 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "798c60e7936093ee783dea5605a90518266430b9" +RELEASE_COMMIT = "8aae016e1c52d9084d778e59d9cad7476c4ce96c" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0" From 65cb87a83469008179c75e13c39414d0bda611ad Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:56:42 -0700 Subject: [PATCH 05/11] Preserve concurrent pause ownership --- src/agent_merge_queue/cli.py | 14 +++++++++----- tests/test_cli.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index a4550d8..bdf6c7b 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -5131,11 +5131,15 @@ def command_unpause( raise QueueError("DeployBot pause changed during unpause; refresh status") refreshed_main = client.base_sha() if refreshed_main != main_sha: - client.set_pipeline_control( - "paused", - f"main advanced during unpause from {main_sha} to {refreshed_main}", - main_sha=refreshed_main, - ) + latest_control = client.pipeline_control() + if latest_control.get("state") == "running" and ( + latest_control.get("resumes_control_id") == control_id + ): + client.set_pipeline_control( + "paused", + f"main advanced during unpause from {main_sha} to {refreshed_main}", + main_sha=refreshed_main, + ) raise QueueError( f"DeployBot main advanced from {main_sha} to {refreshed_main} " "during unpause; pipeline remains paused" diff --git a/tests/test_cli.py b/tests/test_cli.py index 79cc3f1..12f3491 100644 --- 
a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4340,6 +4340,10 @@ def test_unpause_repauses_when_main_advances_during_transition(self) -> None: "state": "running", "resumes_control_id": "pause-1", }, + { + "state": "running", + "resumes_control_id": "pause-1", + }, ] client.base_sha.side_effect = [sha, newer] @@ -4358,6 +4362,35 @@ def test_unpause_repauses_when_main_advances_during_transition(self) -> None: ], ) + def test_unpause_preserves_newer_pause_when_main_advances(self) -> None: + sha = "a" * 40 + newer = "b" * 40 + client = Mock() + client.pipeline_control.side_effect = [ + { + "state": "paused", + "control_id": "pause-1", + "main_sha": sha, + }, + { + "state": "running", + "resumes_control_id": "pause-1", + }, + { + "state": "paused", + "control_id": "pause-2", + "main_sha": newer, + }, + ] + client.base_sha.side_effect = [sha, newer] + + with self.assertRaisesRegex(QueueError, "pipeline remains paused"): + command_unpause(client, main_sha=sha, control_id="pause-1") + + client.set_pipeline_control.assert_called_once_with( + "running", None, resumes_control_id="pause-1" + ) + def test_pipeline_control_ignores_stale_resume_after_new_pause(self) -> None: sha = "a" * 40 client = object.__new__(GitHub) From 722a50fd9dca7a598afc5ccec6079638c72a6158 Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:56:51 -0700 Subject: [PATCH 06/11] Finalize DeployBot v0.2.13 runtime pin --- README.md | 4 ++-- adapters/claude-code/.mcp.json | 2 +- adapters/cursor/.cursor/mcp.json | 2 +- examples/github-workflow.yml | 2 +- tests/test_skill.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d6a4035..405d140 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ 
git+https://github.com/Forward-Future/DeployBot.git@8aae016e1c52d9084d778e59d9cad7476c4ce96c' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@65cb87a83469008179c75e13c39414d0bda611ad' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@8aae016e1c52d9084d778e59d9cad7476c4ce96c +- uses: Forward-Future/DeployBot@65cb87a83469008179c75e13c39414d0bda611ad ``` The Action uses GitHub's built-in workflow token. GitHub intentionally does not diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index 50897dd..cafc062 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@8aae016e1c52d9084d778e59d9cad7476c4ce96c", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@65cb87a83469008179c75e13c39414d0bda611ad", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index 50897dd..cafc062 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@8aae016e1c52d9084d778e59d9cad7476c4ce96c", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@65cb87a83469008179c75e13c39414d0bda611ad", "deploybot-mcp" ] } diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index c6f3cc0..b42e735 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -60,4 +60,4 @@ jobs: ref: ${{ github.event.repository.default_branch }} persist-credentials: false # v0.2.13 implementation; keep the full 
commit for privileged workflows. - - uses: Forward-Future/DeployBot@8aae016e1c52d9084d778e59d9cad7476c4ce96c + - uses: Forward-Future/DeployBot@65cb87a83469008179c75e13c39414d0bda611ad diff --git a/tests/test_skill.py b/tests/test_skill.py index cff241b..9852da3 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "8aae016e1c52d9084d778e59d9cad7476c4ce96c" +RELEASE_COMMIT = "65cb87a83469008179c75e13c39414d0bda611ad" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0" From 14a838f363c98a2a2918a52aa1225d880519f7c0 Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:58:06 -0700 Subject: [PATCH 07/11] Document release-fence ownership after unpause --- src/agent_merge_queue/cli.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index bdf6c7b..511bab2 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -5144,6 +5144,12 @@ def command_unpause( f"DeployBot main advanced from {main_sha} to {refreshed_main} " "during unpause; pipeline remains paused" ) + # This is the compare-and-set boundary. A main advance after this final + # read occurs after the matching pause was successfully resumed and may be + # the expected repair merge. The release-admission fence then blocks every + # later batch until that newer exact main passes CI, deploy, and health + # verification; binding `running` to the failed SHA forever would instead + # re-pause the repair merge and strand takeover workers. 
print(f"DeployBot pipeline is running for recovered main {main_sha}") From 49007d52facac5a23b511c06f8090b92b37fcda8 Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 11:58:15 -0700 Subject: [PATCH 08/11] Pin final DeployBot v0.2.13 runtime --- README.md | 4 ++-- adapters/claude-code/.mcp.json | 2 +- adapters/cursor/.cursor/mcp.json | 2 +- examples/github-workflow.yml | 2 +- tests/test_skill.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 405d140..541d2b7 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@65cb87a83469008179c75e13c39414d0bda611ad' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@14a838f363c98a2a2918a52aa1225d880519f7c0' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@65cb87a83469008179c75e13c39414d0bda611ad +- uses: Forward-Future/DeployBot@14a838f363c98a2a2918a52aa1225d880519f7c0 ``` The Action uses GitHub's built-in workflow token. 
GitHub intentionally does not diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index cafc062..35c62b6 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@65cb87a83469008179c75e13c39414d0bda611ad", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@14a838f363c98a2a2918a52aa1225d880519f7c0", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index cafc062..35c62b6 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@65cb87a83469008179c75e13c39414d0bda611ad", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@14a838f363c98a2a2918a52aa1225d880519f7c0", "deploybot-mcp" ] } diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index b42e735..1d89dce 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -60,4 +60,4 @@ jobs: ref: ${{ github.event.repository.default_branch }} persist-credentials: false # v0.2.13 implementation; keep the full commit for privileged workflows. 
- - uses: Forward-Future/DeployBot@65cb87a83469008179c75e13c39414d0bda611ad + - uses: Forward-Future/DeployBot@14a838f363c98a2a2918a52aa1225d880519f7c0 diff --git a/tests/test_skill.py b/tests/test_skill.py index 9852da3..fc986e4 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "65cb87a83469008179c75e13c39414d0bda611ad" +RELEASE_COMMIT = "14a838f363c98a2a2918a52aa1225d880519f7c0" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0" From 992048a90e1db410b9197fe056c591d91c1b2019 Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 12:00:07 -0700 Subject: [PATCH 09/11] Guard compensating pause ownership --- src/agent_merge_queue/cli.py | 9 +++-- src/agent_merge_queue/records.py | 9 +++++ tests/test_cli.py | 56 ++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index 511bab2..e2180a6 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -1412,6 +1412,7 @@ def set_pipeline_control( reason: str | None = None, *, main_sha: str | None = None, + requires_control_id: str | None = None, resumes_control_id: str | None = None, ) -> str: number = self.registry_issue_number(create=True) @@ -1425,6 +1426,7 @@ def set_pipeline_control( control_id=control_id, reason=reason, main_sha=(main_sha or self.base_sha()) if state == "paused" else None, + requires_control_id=requires_control_id, resumes_control_id=resumes_control_id, ), ) @@ -5123,11 +5125,13 @@ def command_unpause( raise QueueError( f"DeployBot main advanced from {main_sha} to {current_sha}; refresh status" ) - client.set_pipeline_control("running", None, resumes_control_id=control_id) + resume_control_id = client.set_pipeline_control( + "running", None, resumes_control_id=control_id + ) 
refreshed = client.pipeline_control() if refreshed.get("state") != "running" or ( refreshed.get("resumes_control_id") != control_id - ): + ) or refreshed.get("control_id") != resume_control_id: raise QueueError("DeployBot pause changed during unpause; refresh status") refreshed_main = client.base_sha() if refreshed_main != main_sha: @@ -5139,6 +5143,7 @@ def command_unpause( "paused", f"main advanced during unpause from {main_sha} to {refreshed_main}", main_sha=refreshed_main, + requires_control_id=resume_control_id, ) raise QueueError( f"DeployBot main advanced from {main_sha} to {refreshed_main} " diff --git a/src/agent_merge_queue/records.py b/src/agent_merge_queue/records.py index 6c682f0..dcaf7e0 100644 --- a/src/agent_merge_queue/records.py +++ b/src/agent_merge_queue/records.py @@ -148,6 +148,12 @@ def latest_control( state: dict[str, Any] = {"state": "running"} for _, value in sorted(found, key=lambda item: item[0]): if value.get("state") == "paused": + requires_control_id = str(value.get("requires_control_id") or "") + if requires_control_id and not ( + state.get("state") == "running" + and state.get("control_id") == requires_control_id + ): + continue state = value continue if value.get("state") != "running": @@ -531,6 +537,7 @@ def control_body( control_id: str, reason: str | None = None, main_sha: str | None = None, + requires_control_id: str | None = None, resumes_control_id: str | None = None, ) -> str: if state not in {"running", "paused"}: @@ -545,6 +552,8 @@ def control_body( payload["reason"] = reason if main_sha: payload["main_sha"] = main_sha + if requires_control_id: + payload["requires_control_id"] = requires_control_id if resumes_control_id: payload["resumes_control_id"] = resumes_control_id return marker_body(CONTROL_PREFIX, payload, "Recorded DeployBot pipeline control.") diff --git a/tests/test_cli.py b/tests/test_cli.py index 12f3491..d1d119c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4247,10 +4247,12 @@ def 
test_unpause_compare_and_sets_matching_failed_release(self) -> None: }, { "state": "running", + "control_id": "resume-1", "resumes_control_id": control_id, }, ] client.base_sha.return_value = sha + client.set_pipeline_control.return_value = "resume-1" with redirect_stdout(io.StringIO()): command_unpause( @@ -4318,6 +4320,7 @@ def test_unpause_rejects_new_pause_won_during_transition(self) -> None: }, ] client.base_sha.return_value = sha + client.set_pipeline_control.return_value = "resume-1" with self.assertRaisesRegex(QueueError, "changed during unpause"): command_unpause(client, main_sha=sha, control_id="pause-1") @@ -4338,14 +4341,17 @@ def test_unpause_repauses_when_main_advances_during_transition(self) -> None: }, { "state": "running", + "control_id": "resume-1", "resumes_control_id": "pause-1", }, { "state": "running", + "control_id": "resume-1", "resumes_control_id": "pause-1", }, ] client.base_sha.side_effect = [sha, newer] + client.set_pipeline_control.side_effect = ["resume-1", "compensation-1"] with self.assertRaisesRegex(QueueError, "pipeline remains paused"): command_unpause(client, main_sha=sha, control_id="pause-1") @@ -4358,6 +4364,7 @@ def test_unpause_repauses_when_main_advances_during_transition(self) -> None: "paused", f"main advanced during unpause from {sha} to {newer}", main_sha=newer, + requires_control_id="resume-1", ), ], ) @@ -4374,6 +4381,7 @@ def test_unpause_preserves_newer_pause_when_main_advances(self) -> None: }, { "state": "running", + "control_id": "resume-1", "resumes_control_id": "pause-1", }, { @@ -4383,6 +4391,7 @@ def test_unpause_preserves_newer_pause_when_main_advances(self) -> None: }, ] client.base_sha.side_effect = [sha, newer] + client.set_pipeline_control.return_value = "resume-1" with self.assertRaisesRegex(QueueError, "pipeline remains paused"): command_unpause(client, main_sha=sha, control_id="pause-1") @@ -4448,6 +4457,53 @@ def test_pipeline_control_ignores_stale_resume_after_new_pause(self) -> None: 
self.assertEqual(control["state"], "paused") self.assertEqual(control["control_id"], "pause-2") + def test_pipeline_control_ignores_stale_conditional_repause(self) -> None: + sha = "a" * 40 + client = object.__new__(GitHub) + client.coordinator_logins = {"coordinator"} + records = [ + control_body( + state="paused", + control_id="pause-1", + reason=f"ci-failed on {sha}", + main_sha=sha, + ), + control_body( + state="running", + control_id="resume-1", + resumes_control_id="pause-1", + ), + control_body( + state="paused", + control_id="pause-2", + reason=f"deploy-failed on {sha}", + main_sha=sha, + ), + control_body( + state="paused", + control_id="compensation-1", + reason=f"main advanced from {sha}", + main_sha=sha, + requires_control_id="resume-1", + ), + ] + client.registry_comments = Mock( + return_value=[ + { + "id": index, + "created_at": f"2026-06-21T17:17:{index:02d}Z", + "user": {"login": "coordinator"}, + "body": body, + } + for index, body in enumerate(records, start=1) + ] + ) + + control = client.pipeline_control() + + self.assertEqual(control["state"], "paused") + self.assertEqual(control["control_id"], "pause-2") + def test_pipeline_control_migrates_legacy_pause_with_comment_identity(self) -> None: sha = "a" * 40 client = object.__new__(GitHub) From e0865f88bd8eb4acbd2d19abe584f523f00a2ecb Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 12:00:18 -0700 Subject: [PATCH 10/11] Pin final DeployBot v0.2.13 runtime --- README.md | 4 ++-- adapters/claude-code/.mcp.json | 2 +- adapters/cursor/.cursor/mcp.json | 2 +- examples/github-workflow.yml | 2 +- tests/test_skill.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 541d2b7..62a1849 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ 
git+https://github.com/Forward-Future/DeployBot.git@14a838f363c98a2a2918a52aa1225d880519f7c0' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a4eab013952526fd9785678b2a8bb9d772' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@14a838f363c98a2a2918a52aa1225d880519f7c0 +- uses: Forward-Future/DeployBot@992048a4eab013952526fd9785678b2a8bb9d772 ``` The Action uses GitHub's built-in workflow token. GitHub intentionally does not diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index 35c62b6..b43267f 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@14a838f363c98a2a2918a52aa1225d880519f7c0", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a4eab013952526fd9785678b2a8bb9d772", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index 35c62b6..b43267f 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@14a838f363c98a2a2918a52aa1225d880519f7c0", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a4eab013952526fd9785678b2a8bb9d772", "deploybot-mcp" ] } diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index 1d89dce..dc0180a 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -60,4 +60,4 @@ jobs: ref: ${{ github.event.repository.default_branch }} persist-credentials: false # v0.2.13 implementation; keep the full 
commit for privileged workflows. - - uses: Forward-Future/DeployBot@14a838f363c98a2a2918a52aa1225d880519f7c0 + - uses: Forward-Future/DeployBot@992048a4eab013952526fd9785678b2a8bb9d772 diff --git a/tests/test_skill.py b/tests/test_skill.py index fc986e4..52cc1f3 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "14a838f363c98a2a2918a52aa1225d880519f7c0" +RELEASE_COMMIT = "992048a4eab013952526fd9785678b2a8bb9d772" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0" From 161734a9eb712d30770425fa969f13bcf1be5e0a Mon Sep 17 00:00:00 2001 From: Matthew Berman <748450+mberman84@users.noreply.github.com> Date: Sun, 21 Jun 2026 12:00:25 -0700 Subject: [PATCH 11/11] Correct v0.2.13 runtime pin --- README.md | 4 ++-- adapters/claude-code/.mcp.json | 2 +- adapters/cursor/.cursor/mcp.json | 2 +- examples/github-workflow.yml | 2 +- tests/test_skill.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 62a1849..71488ae 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Install the reviewed `v0.2.13` source commit directly from GitHub: ```bash python3 -m pip install \ - 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a4eab013952526fd9785678b2a8bb9d772' + 'deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a90e1db410b9197fe056c591d91c1b2019' deploybot init ``` @@ -95,7 +95,7 @@ worker can dispatch deployment when GitHub suppresses the `workflow_run` event for token-dispatched CI. Pin the Action to the full reviewed release commit: ```yaml -- uses: Forward-Future/DeployBot@992048a4eab013952526fd9785678b2a8bb9d772 +- uses: Forward-Future/DeployBot@992048a90e1db410b9197fe056c591d91c1b2019 ``` The Action uses GitHub's built-in workflow token. 
GitHub intentionally does not diff --git a/adapters/claude-code/.mcp.json b/adapters/claude-code/.mcp.json index b43267f..7c4e77e 100644 --- a/adapters/claude-code/.mcp.json +++ b/adapters/claude-code/.mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a4eab013952526fd9785678b2a8bb9d772", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a90e1db410b9197fe056c591d91c1b2019", "deploybot-mcp" ] } diff --git a/adapters/cursor/.cursor/mcp.json b/adapters/cursor/.cursor/mcp.json index b43267f..7c4e77e 100644 --- a/adapters/cursor/.cursor/mcp.json +++ b/adapters/cursor/.cursor/mcp.json @@ -4,7 +4,7 @@ "command": "uvx", "args": [ "--from", - "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a4eab013952526fd9785678b2a8bb9d772", + "deploybot-merge-queue[mcp] @ git+https://github.com/Forward-Future/DeployBot.git@992048a90e1db410b9197fe056c591d91c1b2019", "deploybot-mcp" ] } diff --git a/examples/github-workflow.yml b/examples/github-workflow.yml index dc0180a..af2d2a6 100644 --- a/examples/github-workflow.yml +++ b/examples/github-workflow.yml @@ -60,4 +60,4 @@ jobs: ref: ${{ github.event.repository.default_branch }} persist-credentials: false # v0.2.13 implementation; keep the full commit for privileged workflows. - - uses: Forward-Future/DeployBot@992048a4eab013952526fd9785678b2a8bb9d772 + - uses: Forward-Future/DeployBot@992048a90e1db410b9197fe056c591d91c1b2019 diff --git a/tests/test_skill.py b/tests/test_skill.py index 52cc1f3..aa83835 100644 --- a/tests/test_skill.py +++ b/tests/test_skill.py @@ -8,7 +8,7 @@ ROOT = Path(__file__).resolve().parents[1] CANONICAL = ROOT / "skills" / "deploybot" / "SKILL.md" -RELEASE_COMMIT = "992048a4eab013952526fd9785678b2a8bb9d772" +RELEASE_COMMIT = "992048a90e1db410b9197fe056c591d91c1b2019" CHECKOUT_COMMIT = "9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0"