From 6d05c45383265fdb4157c359fa7737fb565798df Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Thu, 11 Jun 2026 17:44:21 +1200 Subject: [PATCH 01/12] fix pr envs --- argocd-pr-env-deploy/action.yml | 58 ++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index c417cc6..7c943ef 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -56,14 +56,20 @@ runs: OWN_REPO="$GITHUB_REPOSITORY" OWN_SHORT="${OWN_REPO##*/}" PR_NUMBER="$(jq -r .pull_request.number "$GITHUB_EVENT_PATH")" - # IMPORTANT: $GITHUB_SHA (not .pull_request.head.sha). On pull_request - # events GitHub sets $GITHUB_SHA to the synthetic *merge commit* SHA - # (PR head merged into base, for build validation). build-push-ecr - # tags images as `development-${GITHUB_SHA}`, so the deploy must - # reference the same SHA — `.pull_request.head.sha` (PR branch tip) - # is a *different* commit on every PR and produces tags that don't - # exist in ECR. - OWN_SHA="$GITHUB_SHA" + + # Two SHAs, used for two different things: + # $GITHUB_SHA = merge SHA (PR head merged into base, a + # synthetic commit only reachable via + # refs/pull/N/merge). build-push-ecr tags + # images as `development-${GITHUB_SHA}`, + # so the image-tag override uses this. + # .pull_request.head.sha = PR branch tip — the commit ArgoCD's + # repo-server can actually fetch as a + # `targetRevision`. Used for the chart + # revision so chart changes on the PR + # branch deploy with the PR. + OWN_MERGE_SHA="$GITHUB_SHA" + OWN_HEAD_SHA="$(jq -r .pull_request.head.sha "$GITHUB_EVENT_PATH")" PARENT_APP="pr-${OWN_SHORT}-${PR_NUMBER}" # GitHub REST helper — auth + sensible headers in one place. No `gh` @@ -74,11 +80,23 @@ runs: "https://api.github.com$1" } - # Build `--helm-set tags.=development-` for own + any linked - # PRs in the anchor body. Self-links and unresolvable PRs are skipped. 
- set_args=( --helm-set "tags.${OWN_SHORT}=development-${OWN_SHA}" ) - while IFS=: read -r repo sha; do - set_args+=( --helm-set "tags.${repo}=development-${sha}" ) + # Build the override list — for each known PR (own + every linked + # one) we emit two helm-set args: + # tags.=development- image tag (matches ECR push) + # revisions.= chart git revision (ArgoCD + # fetches the chart at this + # commit; pr-branch chart + # changes deploy with the PR) + # Self-links and unresolvable PRs are skipped silently. + set_args=( + --helm-set "tags.${OWN_SHORT}=development-${OWN_MERGE_SHA}" + --helm-set "revisions.${OWN_SHORT}=${OWN_HEAD_SHA}" + ) + while IFS=: read -r repo merge_sha head_sha; do + set_args+=( + --helm-set "tags.${repo}=development-${merge_sha}" + --helm-set "revisions.${repo}=${head_sha}" + ) done < <( gh_api "/repos/$OWN_REPO/pulls/$PR_NUMBER" \ | jq -r '.body // ""' \ @@ -86,12 +104,14 @@ runs: | while read -r _kw ref; do full="${ref%%#*}"; num="${ref##*#}" [ "$full" = "$OWN_REPO" ] && continue - # .merge_commit_sha, not .head.sha — the linked PR's build - # also ran under $GITHUB_SHA (the merge SHA), so that's what - # the ECR tag will be. merge_commit_sha is null when the PR - # is unmergeable (conflicts); we skip those silently. - sha="$(gh_api "/repos/$full/pulls/$num" | jq -r '.merge_commit_sha // empty')" || continue - [ -n "$sha" ] && echo "${full##*/}:${sha}" + # One API call, both SHAs out. merge_commit_sha is null on + # unmergeable PRs; we skip those. 
+ read -r merge_sha head_sha < <( + gh_api "/repos/$full/pulls/$num" \ + | jq -r '[.merge_commit_sha // "", .head.sha // ""] | @tsv' + ) || continue + [ -n "$merge_sha" ] && [ -n "$head_sha" ] \ + && echo "${full##*/}:${merge_sha}:${head_sha}" done ) From 4bdc79bbda78baf976f7cb6c630ccca7c7e0e284 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Fri, 12 Jun 2026 12:43:49 +1200 Subject: [PATCH 02/12] update --- argocd-pr-env-deploy/action.yml | 40 ++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index 7c943ef..ca67184 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -2,8 +2,9 @@ # # Resolves: # - This PR (anchor) — own head SHA, repo short name from $GITHUB_REPOSITORY. -# - Any `Deploys: org/repo#N` line in the anchor PR body — looks up each -# linked PR's head SHA via the GitHub REST API. +# - Any `Deploys: repo#N` or `Deploys: org/repo#N` line in the anchor PR +# body (owner defaults to this repo's org) — looks up each linked PR's +# head SHA via the GitHub REST API. # # Issues a single `argocd app set` against the parent Application # (`pr--`) with `--helm-set tags.=development-` per @@ -87,7 +88,9 @@ runs: # fetches the chart at this # commit; pr-branch chart # changes deploy with the PR) - # Self-links and unresolvable PRs are skipped silently. + # Links accept `Deploys: repo#N` (owner defaults to this repo's org) + # or `Deploys: org/repo#N`. Self-links are skipped; unresolvable PRs + # are skipped with a warning. 
set_args=( --helm-set "tags.${OWN_SHORT}=development-${OWN_MERGE_SHA}" --helm-set "revisions.${OWN_SHORT}=${OWN_HEAD_SHA}" @@ -103,15 +106,23 @@ runs: | grep -oE 'Deploys:[[:space:]]+[^#[:space:]]+#[0-9]+' \ | while read -r _kw ref; do full="${ref%%#*}"; num="${ref##*#}" + case "$full" in + */*) ;; + *) full="${OWN_REPO%%/*}/$full" ;; + esac [ "$full" = "$OWN_REPO" ] && continue # One API call, both SHAs out. merge_commit_sha is null on # unmergeable PRs; we skip those. + merge_sha=""; head_sha="" read -r merge_sha head_sha < <( gh_api "/repos/$full/pulls/$num" \ | jq -r '[.merge_commit_sha // "", .head.sha // ""] | @tsv' - ) || continue - [ -n "$merge_sha" ] && [ -n "$head_sha" ] \ - && echo "${full##*/}:${merge_sha}:${head_sha}" + ) || true + if [ -n "${merge_sha:-}" ] && [ -n "${head_sha:-}" ]; then + echo "${full##*/}:${merge_sha}:${head_sha}" + else + echo "::warning::Deploys link ${full}#${num} could not be resolved (PR missing or unmergeable); skipping" >&2 + fi done ) @@ -132,5 +143,22 @@ runs: # accepts username/password/sso), and the ARGOCD_SERVER + # ARGOCD_AUTH_TOKEN env vars set above are picked up by every # subsequent argocd command directly. + + # The parent Application is created by the pr-environments + # ApplicationSet, which polls GitHub every 60s (requeueAfterSeconds) — + # on a fresh `deploy` label the app may not exist yet when this step + # runs (builds can be cache-fast). Poll before `app set` instead of + # failing hard; 5 minutes is comfortably past the AppSet requeue. + for i in $(seq 1 30); do + "$ARGOCD_BIN" app get "$PARENT_APP" --grpc-web >/dev/null 2>&1 && break + if [ "$i" -eq 30 ]; then + echo "Timed out waiting for Application ${PARENT_APP} to be created by the pr-environments ApplicationSet." >&2 + echo "Check the PR carries the 'deploy' label and the ApplicationSet controller is healthy." >&2 + exit 1 + fi + echo "Application ${PARENT_APP} not found yet (attempt ${i}/30); retrying in 10s..." 
+ sleep 10 + done + echo "argocd app set ${PARENT_APP} ${set_args[*]}" "$ARGOCD_BIN" app set "$PARENT_APP" --grpc-web "${set_args[@]}" From 8c34ab58655dbfe2eae5d8759ce3a2fa0ab9e499 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Fri, 12 Jun 2026 12:55:19 +1200 Subject: [PATCH 03/12] fix --- argocd-pr-env-deploy/action.yml | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index ca67184..9e20f70 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -150,13 +150,29 @@ runs: # runs (builds can be cache-fast). Poll before `app set` instead of # failing hard; 5 minutes is comfortably past the AppSet requeue. for i in $(seq 1 30); do - "$ARGOCD_BIN" app get "$PARENT_APP" --grpc-web >/dev/null 2>&1 && break + if get_err="$("$ARGOCD_BIN" app get "$PARENT_APP" --grpc-web 2>&1 >/dev/null)"; then + break + fi + # ArgoCD reports a non-existent app as PermissionDenied for + # API-key accounts (existence is hidden), so both NotFound and + # PermissionDenied mean "keep waiting". Anything else (bad token, + # TLS/connection errors, version mismatch) won't fix itself — + # fail immediately with the real error instead of looping. + case "$get_err" in + *PermissionDenied*|*NotFound*|*"not found"*|*"permission denied"*) ;; + *) + echo "argocd app get ${PARENT_APP} failed with an unexpected error:" >&2 + echo "$get_err" >&2 + exit 1 + ;; + esac if [ "$i" -eq 30 ]; then echo "Timed out waiting for Application ${PARENT_APP} to be created by the pr-environments ApplicationSet." >&2 + echo "Last error: $get_err" >&2 echo "Check the PR carries the 'deploy' label and the ApplicationSet controller is healthy." >&2 exit 1 fi - echo "Application ${PARENT_APP} not found yet (attempt ${i}/30); retrying in 10s..." 
+ echo "Application ${PARENT_APP} not available yet (attempt ${i}/30): $get_err" sleep 10 done From 4778b9960f9060ccd1e730fd57035c718287c869 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Fri, 12 Jun 2026 14:06:24 +1200 Subject: [PATCH 04/12] fix --- argocd-pr-env-deploy/action.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index 9e20f70..f8e9f50 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -178,3 +178,20 @@ runs: echo "argocd app set ${PARENT_APP} ${set_args[*]}" "$ARGOCD_BIN" app set "$PARENT_APP" --grpc-web "${set_args[@]}" + + # A sync operation may be in flight — e.g. the parent's very first + # automated sync, or a rollout from a previous commit stuck waiting on + # an unhealthy wave. `app set` updates the spec but does NOT interrupt + # a running operation, and ArgoCD won't start the auto-sync carrying + # the new parameters until the current operation ends — which it never + # does if a wave is wedged (e.g. ImagePullBackOff). Terminate it; the + # terminated operation ran with the OLD parameters, so automated sync + # immediately starts a fresh one with the parameters set above. When + # nothing is running the CLI errors ("Unable to terminate operation. + # No operation is in progress"), which is the normal case and fine. + # (TerminateOperation requires the `applications, sync` RBAC grant.) + if term_err="$("$ARGOCD_BIN" app terminate-op "$PARENT_APP" --grpc-web 2>&1)"; then + echo "Terminated in-flight sync on ${PARENT_APP}; automated sync will restart with the new parameters." + else + echo "No in-flight sync to terminate on ${PARENT_APP} (${term_err})." 
+ fi From 7be5a9c32df05129894e74536f90e52c1f9928c8 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Tue, 16 Jun 2026 11:41:15 +1200 Subject: [PATCH 05/12] fix --- argocd-pr-env-deploy/action.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index f8e9f50..27462d0 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -195,3 +195,12 @@ runs: else echo "No in-flight sync to terminate on ${PARENT_APP} (${term_err})." fi + + # Explicitly request a sync rather than waiting for the application + # controller's reconciliation poll to notice the spec change. The CLI + # call returns once the operation has been *queued* (--async); we + # don't wait for the full rollout here because parent syncs are + # wave-gated and can take minutes — that's argocd's job to drive + # from this point on. Failing to queue is fatal (likely RBAC). + echo "argocd app sync ${PARENT_APP} --async" + "$ARGOCD_BIN" app sync "$PARENT_APP" --grpc-web --async From 06f889ca0f9d51e7aa4fd3345fda9dab8ee266cd Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Tue, 16 Jun 2026 15:26:12 +1200 Subject: [PATCH 06/12] fix --- argocd-pr-env-deploy/action.yml | 57 ++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index 27462d0..20155f2 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -149,30 +149,51 @@ runs: # on a fresh `deploy` label the app may not exist yet when this step # runs (builds can be cache-fast). Poll before `app set` instead of # failing hard; 5 minutes is comfortably past the AppSet requeue. + # Two states make this poll non-trivial: + # 1. App doesn't exist yet — ApplicationSet hasn't run its requeue + # cycle (60s) and created it. `argocd app get` returns NotFound + # (or PermissionDenied — API-key accounts can't tell existence). + # 2. 
App exists but is being DELETED — happens when the user removed + # the `deploy` label then re-added it: the AppSet starts deleting + # the old Application before recreating, and `argocd app get` + # returns the deleting object successfully. If we proceed, + # `argocd app sync` fails with FailedPrecondition: application + # is deleting. Detect that via `.metadata.deletionTimestamp` on + # the JSON output and treat it the same as "not ready yet". + # 5 minutes is comfortably past both the AppSet requeue and the + # typical Application finaliser cleanup. for i in $(seq 1 30); do - if get_err="$("$ARGOCD_BIN" app get "$PARENT_APP" --grpc-web 2>&1 >/dev/null)"; then - break + if get_out="$("$ARGOCD_BIN" app get "$PARENT_APP" --grpc-web -o json 2>&1)"; then + # App exists. Is it being deleted? If so, keep waiting until the + # new one (which won't have a deletionTimestamp) comes up. + del_ts="$(printf '%s' "$get_out" | jq -r '.metadata.deletionTimestamp // empty' 2>/dev/null || true)" + if [ -z "${del_ts:-}" ]; then + break + fi + get_err="application is being deleted (deletionTimestamp=${del_ts}); waiting for ApplicationSet to recreate" + else + get_err="$get_out" + # ArgoCD reports a non-existent app as PermissionDenied for + # API-key accounts (existence is hidden), so both NotFound and + # PermissionDenied mean "keep waiting". Anything else (bad token, + # TLS/connection errors, version mismatch) won't fix itself — + # fail immediately with the real error instead of looping. + case "$get_err" in + *PermissionDenied*|*NotFound*|*"not found"*|*"permission denied"*) ;; + *) + echo "argocd app get ${PARENT_APP} failed with an unexpected error:" >&2 + echo "$get_err" >&2 + exit 1 + ;; + esac fi - # ArgoCD reports a non-existent app as PermissionDenied for - # API-key accounts (existence is hidden), so both NotFound and - # PermissionDenied mean "keep waiting". 
Anything else (bad token, - # TLS/connection errors, version mismatch) won't fix itself — - # fail immediately with the real error instead of looping. - case "$get_err" in - *PermissionDenied*|*NotFound*|*"not found"*|*"permission denied"*) ;; - *) - echo "argocd app get ${PARENT_APP} failed with an unexpected error:" >&2 - echo "$get_err" >&2 - exit 1 - ;; - esac if [ "$i" -eq 30 ]; then - echo "Timed out waiting for Application ${PARENT_APP} to be created by the pr-environments ApplicationSet." >&2 - echo "Last error: $get_err" >&2 + echo "Timed out waiting for Application ${PARENT_APP} to be ready." >&2 + echo "Last status: $get_err" >&2 echo "Check the PR carries the 'deploy' label and the ApplicationSet controller is healthy." >&2 exit 1 fi - echo "Application ${PARENT_APP} not available yet (attempt ${i}/30): $get_err" + echo "Application ${PARENT_APP} not ready (attempt ${i}/30): $get_err" sleep 10 done From 537a5e98220bb5f957f8e43bc3b568a99814afb5 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Tue, 16 Jun 2026 15:29:00 +1200 Subject: [PATCH 07/12] fix --- argocd-pr-env-deploy/action.yml | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index 20155f2..ba0c7d1 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -222,6 +222,24 @@ runs: # call returns once the operation has been *queued* (--async); we # don't wait for the full rollout here because parent syncs are # wave-gated and can take minutes — that's argocd's job to drive - # from this point on. Failing to queue is fatal (likely RBAC). + # from this point on. + # + # FailedPrecondition: "another operation is already in progress" + # means automated.sync ALREADY queued an op in response to our + # `app set` (which happens within ~ms when automated sync is on). + # That's the success case for us — bail with a friendly log, not + # exit 1. 
Any other failure (RBAC, connectivity) still fatals. echo "argocd app sync ${PARENT_APP} --async" - "$ARGOCD_BIN" app sync "$PARENT_APP" --grpc-web --async + if sync_err="$("$ARGOCD_BIN" app sync "$PARENT_APP" --grpc-web --async 2>&1)"; then + echo "Sync queued." + else + case "$sync_err" in + *"another operation is already in progress"*) + echo "Sync was already in progress (automated.sync picked up the app set change); nothing to do." + ;; + *) + echo "$sync_err" >&2 + exit 1 + ;; + esac + fi From 7bf97f7b3af4a7f5c1730171fee2aac55b53f7aa Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Thu, 18 Jun 2026 16:30:04 +1200 Subject: [PATCH 08/12] wait --- argocd-pr-env-deploy/action.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index ba0c7d1..b824a76 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -8,8 +8,11 @@ # # Issues a single `argocd app set` against the parent Application # (`pr--`) with `--helm-set tags.=development-` per -# resolved repo. ArgoCD's automated sync + the sync-wave annotations on the -# parent's children handle the rollout — no `argocd app sync` is issued here. +# resolved repo, kicks the auto-sync via `argocd app sync --async`, and then +# blocks on `argocd app wait --health` (up to 30m) so the GH job stays open +# until the env is actually up. The GH Deployment sidebar entry on the PR +# is driven by the caller workflow's `environment:` block (native GH), not +# by this action — its job is solely to deploy and gate on Healthy. # # Must run from a `pull_request` event; PR number and head SHA are read from # $GITHUB_EVENT_PATH so callers don't have to pass them. @@ -243,3 +246,12 @@ runs: ;; esac fi + + # Block until the parent reports Healthy, or fail the job after 30m. + # The parent is wave-gated, so Healthy implies every child rolled + # out. 
While we wait the GH Actions job log stays attached so devs + # can tail progress, and the job's exit code drives the native GH + # Deployment status (set via the caller workflow's `environment:` + # block): success → green "deployed" pill, failure → red. + echo "argocd app wait ${PARENT_APP} --health --timeout 1800" + "$ARGOCD_BIN" app wait "$PARENT_APP" --grpc-web --health --timeout 1800 From adeb0c94574aeb9a3a17c4a8a144240daa7af482 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Thu, 18 Jun 2026 16:39:32 +1200 Subject: [PATCH 09/12] fix wait --- argocd-pr-env-deploy/action.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index b824a76..c4f6aff 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -247,11 +247,15 @@ runs: esac fi - # Block until the parent reports Healthy, or fail the job after 30m. - # The parent is wave-gated, so Healthy implies every child rolled - # out. While we wait the GH Actions job log stays attached so devs - # can tail progress, and the job's exit code drives the native GH - # Deployment status (set via the caller workflow's `environment:` - # block): success → green "deployed" pill, failure → red. - echo "argocd app wait ${PARENT_APP} --health --timeout 1800" - "$ARGOCD_BIN" app wait "$PARENT_APP" --grpc-web --health --timeout 1800 + # Block until the whole env reaches Synced+Healthy with no in-flight + # sync, or fail the job after 30m. We wait on the LABEL SET (parent + # + every child Application), not just the parent: the parent's + # status.health.status is Healthy as soon as its rendered children + # exist, even when those children are still OutOfSync from the spec + # we just `app set`. Without `--sync` and `--operation`, wait would + # return immediately on the old pods' Healthy state. 
The label is + # written by the AppSet template onto the parent and by `prenv.labels` + # onto every child, so this one selector covers both. + WAIT_SELECTOR="pr-env.mindsdb.com/anchor-repo=${OWN_SHORT},pr-env.mindsdb.com/pr-number=${PR_NUMBER}" + echo "argocd app wait -l ${WAIT_SELECTOR} --sync --health --operation --timeout 1800" + "$ARGOCD_BIN" app wait -l "$WAIT_SELECTOR" --grpc-web --sync --health --operation --timeout 1800 From 6aeb38c8a3761d86be961c9444287bf231a4b6c3 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Fri, 19 Jun 2026 10:11:55 +1200 Subject: [PATCH 10/12] add commit status --- argocd-pr-env-deploy/action.yml | 52 ++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index c4f6aff..fb8d975 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -9,10 +9,16 @@ # Issues a single `argocd app set` against the parent Application # (`pr--`) with `--helm-set tags.=development-` per # resolved repo, kicks the auto-sync via `argocd app sync --async`, and then -# blocks on `argocd app wait --health` (up to 30m) so the GH job stays open -# until the env is actually up. The GH Deployment sidebar entry on the PR -# is driven by the caller workflow's `environment:` block (native GH), not -# by this action — its job is solely to deploy and gate on Healthy. +# blocks on `argocd app wait` (up to 30m) so the GH job stays open until +# the env is actually up. +# +# Two surfaces are kept in sync with the rollout state: +# * Native GH Deployment — driven by the caller workflow's `environment:` +# block. Shows in the PR sidebar + repo Environments tab. Status is +# just the job's exit code. +# * Commit status (this file) — one `pr-env/hub` row in the PR's checks +# panel, clickable through to the env hub URL. Posted as `pending` +# before the sync, `success`/`failure` after the wait returns. 
# # Must run from a `pull_request` event; PR number and head SHA are read from # $GITHUB_EVENT_PATH so callers don't have to pass them. @@ -76,13 +82,38 @@ runs: OWN_HEAD_SHA="$(jq -r .pull_request.head.sha "$GITHUB_EVENT_PATH")" PARENT_APP="pr-${OWN_SHORT}-${PR_NUMBER}" - # GitHub REST helper — auth + sensible headers in one place. No `gh` - # dependency: self-hosted runners may not have it. + # Hub URL — env entry point and the target_url of the commit status + # below. Hub host equals the env name. + HUB_URL="https://pr-${OWN_SHORT}-${PR_NUMBER}.dev.mindshub.ai" + + # GitHub REST helpers (GET + POST) — auth + sensible headers in one + # place. No `gh` dependency: self-hosted runners may not have it. gh_api() { curl -sfH "Authorization: Bearer $GH_TOKEN" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com$1" } + gh_api_post() { + curl -sfH "Authorization: Bearer $GH_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + -H "Content-Type: application/json" \ + -d "$2" \ + "https://api.github.com$1" + } + + # Post a `pr-env/hub` commit status. Renders as a row in the PR's + # checks panel, clickable through to the env hub URL. Called twice: + # `pending` before the sync, then `success`/`failure` after wait. + post_pr_env_status() { + local state="$1" desc="$2" + if ! gh_api_post "/repos/${OWN_REPO}/statuses/${OWN_HEAD_SHA}" \ + "$(jq -nc \ + --arg state "$state" --arg url "$HUB_URL" --arg desc "$desc" \ + '{state:$state, context:"pr-env/hub", target_url:$url, description:$desc}')" \ + >/dev/null; then + echo "::warning::Failed to post commit status pr-env/hub=${state}" + fi + } # Build the override list — for each known PR (own + every linked # one) we emit two helm-set args: @@ -203,6 +234,8 @@ runs: echo "argocd app set ${PARENT_APP} ${set_args[*]}" "$ARGOCD_BIN" app set "$PARENT_APP" --grpc-web "${set_args[@]}" + post_pr_env_status "pending" "Env spinning up" + # A sync operation may be in flight — e.g. 
the parent's very first # automated sync, or a rollout from a previous commit stuck waiting on # an unhealthy wave. `app set` updates the spec but does NOT interrupt @@ -258,4 +291,9 @@ runs: # onto every child, so this one selector covers both. WAIT_SELECTOR="pr-env.mindsdb.com/anchor-repo=${OWN_SHORT},pr-env.mindsdb.com/pr-number=${PR_NUMBER}" echo "argocd app wait -l ${WAIT_SELECTOR} --sync --health --operation --timeout 1800" - "$ARGOCD_BIN" app wait -l "$WAIT_SELECTOR" --grpc-web --sync --health --operation --timeout 1800 + if "$ARGOCD_BIN" app wait -l "$WAIT_SELECTOR" --grpc-web --sync --health --operation --timeout 1800; then + post_pr_env_status "success" "Ready" + else + post_pr_env_status "failure" "Env did not become Healthy within 30 minutes" + exit 1 + fi From 9a5c2d3876d13f26827f29a10a0786609e3408ae Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Fri, 19 Jun 2026 17:14:19 +1200 Subject: [PATCH 11/12] tidy up --- argocd-pr-env-deploy/action.yml | 116 ++++++++------------------------ 1 file changed, 29 insertions(+), 87 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index fb8d975..873178e 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -12,13 +12,9 @@ # blocks on `argocd app wait` (up to 30m) so the GH job stays open until # the env is actually up. # -# Two surfaces are kept in sync with the rollout state: -# * Native GH Deployment — driven by the caller workflow's `environment:` -# block. Shows in the PR sidebar + repo Environments tab. Status is -# just the job's exit code. -# * Commit status (this file) — one `pr-env/hub` row in the PR's checks -# panel, clickable through to the env hub URL. Posted as `pending` -# before the sync, `success`/`failure` after the wait returns. +# Rollout state surfaces in the PR via the caller workflow's `environment:` +# block (native GH Deployment). 
The PR sidebar pill is `in_progress` while +# the job runs and flips to `success`/`failure` on the job's exit code. # # Must run from a `pull_request` event; PR number and head SHA are read from # $GITHUB_EVENT_PATH so callers don't have to pass them. @@ -82,38 +78,13 @@ runs: OWN_HEAD_SHA="$(jq -r .pull_request.head.sha "$GITHUB_EVENT_PATH")" PARENT_APP="pr-${OWN_SHORT}-${PR_NUMBER}" - # Hub URL — env entry point and the target_url of the commit status - # below. Hub host equals the env name. - HUB_URL="https://pr-${OWN_SHORT}-${PR_NUMBER}.dev.mindshub.ai" - - # GitHub REST helpers (GET + POST) — auth + sensible headers in one - # place. No `gh` dependency: self-hosted runners may not have it. + # GitHub REST helper — auth + sensible headers in one place. No `gh` + # dependency: self-hosted runners may not have it. gh_api() { curl -sfH "Authorization: Bearer $GH_TOKEN" \ -H "Accept: application/vnd.github+json" \ "https://api.github.com$1" } - gh_api_post() { - curl -sfH "Authorization: Bearer $GH_TOKEN" \ - -H "Accept: application/vnd.github+json" \ - -H "Content-Type: application/json" \ - -d "$2" \ - "https://api.github.com$1" - } - - # Post a `pr-env/hub` commit status. Renders as a row in the PR's - # checks panel, clickable through to the env hub URL. Called twice: - # `pending` before the sync, then `success`/`failure` after wait. - post_pr_env_status() { - local state="$1" desc="$2" - if ! gh_api_post "/repos/${OWN_REPO}/statuses/${OWN_HEAD_SHA}" \ - "$(jq -nc \ - --arg state "$state" --arg url "$HUB_URL" --arg desc "$desc" \ - '{state:$state, context:"pr-env/hub", target_url:$url, description:$desc}')" \ - >/dev/null; then - echo "::warning::Failed to post commit status pr-env/hub=${state}" - fi - } # Build the override list — for each known PR (own + every linked # one) we emit two helm-set args: @@ -178,64 +149,40 @@ runs: # ARGOCD_AUTH_TOKEN env vars set above are picked up by every # subsequent argocd command directly. 
- # The parent Application is created by the pr-environments - # ApplicationSet, which polls GitHub every 60s (requeueAfterSeconds) — - # on a fresh `deploy` label the app may not exist yet when this step - # runs (builds can be cache-fast). Poll before `app set` instead of - # failing hard; 5 minutes is comfortably past the AppSet requeue. - # Two states make this poll non-trivial: - # 1. App doesn't exist yet — ApplicationSet hasn't run its requeue - # cycle (60s) and created it. `argocd app get` returns NotFound - # (or PermissionDenied — API-key accounts can't tell existence). - # 2. App exists but is being DELETED — happens when the user removed - # the `deploy` label then re-added it: the AppSet starts deleting - # the old Application before recreating, and `argocd app get` - # returns the deleting object successfully. If we proceed, - # `argocd app sync` fails with FailedPrecondition: application - # is deleting. Detect that via `.metadata.deletionTimestamp` on - # the JSON output and treat it the same as "not ready yet". + # Retry `app set` directly until the parent App is set-able. Two + # transient states are tolerated; anything else is fatal: + # 1. App doesn't exist yet — ApplicationSet hasn't run its 60s + # requeue cycle and created it (NotFound, or PermissionDenied + # for API-key accounts that can't see non-existent apps). + # 2. App exists but is being DELETED — label was removed then + # re-added; AppSet is mid-delete before recreating. `app set` + # returns "application is being deleted". + # Network/TLS/auth errors are NOT retried — they fail immediately. # 5 minutes is comfortably past both the AppSet requeue and the # typical Application finaliser cleanup. + echo "argocd app set ${PARENT_APP} ${set_args[*]}" for i in $(seq 1 30); do - if get_out="$("$ARGOCD_BIN" app get "$PARENT_APP" --grpc-web -o json 2>&1)"; then - # App exists. Is it being deleted? If so, keep waiting until the - # new one (which won't have a deletionTimestamp) comes up. 
- del_ts="$(printf '%s' "$get_out" | jq -r '.metadata.deletionTimestamp // empty' 2>/dev/null || true)" - if [ -z "${del_ts:-}" ]; then - break - fi - get_err="application is being deleted (deletionTimestamp=${del_ts}); waiting for ApplicationSet to recreate" - else - get_err="$get_out" - # ArgoCD reports a non-existent app as PermissionDenied for - # API-key accounts (existence is hidden), so both NotFound and - # PermissionDenied mean "keep waiting". Anything else (bad token, - # TLS/connection errors, version mismatch) won't fix itself — - # fail immediately with the real error instead of looping. - case "$get_err" in - *PermissionDenied*|*NotFound*|*"not found"*|*"permission denied"*) ;; - *) - echo "argocd app get ${PARENT_APP} failed with an unexpected error:" >&2 - echo "$get_err" >&2 - exit 1 - ;; - esac + if set_err="$("$ARGOCD_BIN" app set "$PARENT_APP" --grpc-web "${set_args[@]}" 2>&1)"; then + break fi + case "$set_err" in + *PermissionDenied*|*NotFound*|*"not found"*|*"permission denied"*|*"being deleted"*) ;; + *) + echo "argocd app set ${PARENT_APP} failed unexpectedly:" >&2 + echo "$set_err" >&2 + exit 1 + ;; + esac if [ "$i" -eq 30 ]; then - echo "Timed out waiting for Application ${PARENT_APP} to be ready." >&2 - echo "Last status: $get_err" >&2 + echo "Timed out waiting for Application ${PARENT_APP} to be set-able." >&2 + echo "Last error: $set_err" >&2 echo "Check the PR carries the 'deploy' label and the ApplicationSet controller is healthy." >&2 exit 1 fi - echo "Application ${PARENT_APP} not ready (attempt ${i}/30): $get_err" + echo "Application ${PARENT_APP} not set-able (attempt ${i}/30): $set_err" sleep 10 done - echo "argocd app set ${PARENT_APP} ${set_args[*]}" - "$ARGOCD_BIN" app set "$PARENT_APP" --grpc-web "${set_args[@]}" - - post_pr_env_status "pending" "Env spinning up" - # A sync operation may be in flight — e.g. the parent's very first # automated sync, or a rollout from a previous commit stuck waiting on # an unhealthy wave. 
`app set` updates the spec but does NOT interrupt @@ -291,9 +238,4 @@ runs: # onto every child, so this one selector covers both. WAIT_SELECTOR="pr-env.mindsdb.com/anchor-repo=${OWN_SHORT},pr-env.mindsdb.com/pr-number=${PR_NUMBER}" echo "argocd app wait -l ${WAIT_SELECTOR} --sync --health --operation --timeout 1800" - if "$ARGOCD_BIN" app wait -l "$WAIT_SELECTOR" --grpc-web --sync --health --operation --timeout 1800; then - post_pr_env_status "success" "Ready" - else - post_pr_env_status "failure" "Env did not become Healthy within 30 minutes" - exit 1 - fi + "$ARGOCD_BIN" app wait -l "$WAIT_SELECTOR" --grpc-web --sync --health --operation --timeout 1800 From 4f6e9422f2f85011bf0975c6694109b9b67c34b4 Mon Sep 17 00:00:00 2001 From: Hamish Fagg Date: Fri, 19 Jun 2026 17:25:28 +1200 Subject: [PATCH 12/12] fix --- argocd-pr-env-deploy/action.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/argocd-pr-env-deploy/action.yml b/argocd-pr-env-deploy/action.yml index 873178e..edd106e 100644 --- a/argocd-pr-env-deploy/action.yml +++ b/argocd-pr-env-deploy/action.yml @@ -61,6 +61,12 @@ runs: OWN_REPO="$GITHUB_REPOSITORY" OWN_SHORT="${OWN_REPO##*/}" + # Slugified form for K8s resource names + labels. RFC 1123 forbids + # underscores in Application names / Namespaces / hostnames, but + # `mindshub_frontend` is the canonical GitHub repo name and stays + # underscored everywhere it identifies the repo (helm parameter + # keys, the `Deploys:` regex, ECR image tags). + OWN_SLUG="${OWN_SHORT//_/-}" PR_NUMBER="$(jq -r .pull_request.number "$GITHUB_EVENT_PATH")" # Two SHAs, used for two different things: @@ -76,7 +82,7 @@ runs: # branch deploy with the PR. OWN_MERGE_SHA="$GITHUB_SHA" OWN_HEAD_SHA="$(jq -r .pull_request.head.sha "$GITHUB_EVENT_PATH")" - PARENT_APP="pr-${OWN_SHORT}-${PR_NUMBER}" + PARENT_APP="pr-${OWN_SLUG}-${PR_NUMBER}" # GitHub REST helper — auth + sensible headers in one place. No `gh` # dependency: self-hosted runners may not have it. 
@@ -236,6 +242,6 @@ runs: # return immediately on the old pods' Healthy state. The label is # written by the AppSet template onto the parent and by `prenv.labels` # onto every child, so this one selector covers both. - WAIT_SELECTOR="pr-env.mindsdb.com/anchor-repo=${OWN_SHORT},pr-env.mindsdb.com/pr-number=${PR_NUMBER}" + WAIT_SELECTOR="pr-env.mindsdb.com/anchor-repo=${OWN_SLUG},pr-env.mindsdb.com/pr-number=${PR_NUMBER}" echo "argocd app wait -l ${WAIT_SELECTOR} --sync --health --operation --timeout 1800" "$ARGOCD_BIN" app wait -l "$WAIT_SELECTOR" --grpc-web --sync --health --operation --timeout 1800