From 41a482524618078e4a66ebfb91d0239c0dd0f4f0 Mon Sep 17 00:00:00 2001
From: Brian Reardon
Date: Tue, 16 Jun 2026 18:55:20 -0700
Subject: [PATCH] distribute worker image multi-cell -> gallery replication, infisical /shared publish

---
Review notes (text between the `---` line and the diffstat is not part of
the commit message):
- Infisical login now assigns, checks, masks, then exports the token.
  `export VAR=$(cmd)` returns export's own status, so a failed login used to
  continue with an empty INFISICAL_TOKEN.
- set_shared refuses empty values: /shared is fanned out to every cell KV,
  so a blank image-id must fail here instead of being published.
- NOTE(review): this patch reached review with its whitespace collapsed.
  Indentation was reconstructed from GitHub Actions / hclfmt conventions;
  the `index` blob ids below are those of the original patch and were not
  recomputed. Confirm context-line whitespace against the tree before applying.

 .github/workflows/build-worker-ami.yml | 66 ++++++++++++++++++++++++++--------
 deploy/packer/worker-ami.pkr.hcl       | 10 ++++++++-
 2 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/build-worker-ami.yml b/.github/workflows/build-worker-ami.yml
index 2a8e5d1c..5b062d4a 100644
--- a/.github/workflows/build-worker-ami.yml
+++ b/.github/workflows/build-worker-ami.yml
@@ -46,6 +46,10 @@ env:
   AZURE_LOCATION: ${{ vars.AZURE_LOCATION || 'eastus2' }}
   AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
   AZURE_GALLERY_NAME: ${{ vars.AZURE_GALLERY_NAME || 'opensandbox_gallery' }}
+  # JSON array of extra regions to replicate the gallery image to, beyond the
+  # build region — must list every cell region running workers. e.g.
+  # '["westus2","westus3"]'. Empty = build region only (single-cell).
+  AZURE_REPLICATION_REGIONS: ${{ vars.AZURE_REPLICATION_REGIONS || '[]' }}
 
 jobs:
   build-image:
@@ -146,6 +150,7 @@ jobs:
             -var "subscription_id=$AZURE_SUBSCRIPTION_ID" \
             -var "resource_group=$AZURE_RESOURCE_GROUP" \
             -var "location=$AZURE_LOCATION" \
+            -var "replication_regions=$AZURE_REPLICATION_REGIONS" \
             -var "gallery_name=$AZURE_GALLERY_NAME" \
             -var "image_version_patch=$PATCH" \
             -var "base_archive_account=${{ secrets.AZURE_STORAGE_ACCOUNT }}" \
@@ -181,32 +186,61 @@ jobs:
           tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
-      - name: Store image ID in Key Vault
+      - name: Publish image coordinates to Infisical (/shared → all cell KVs)
+        env:
+          # Universal-auth machine identity with write access to /shared.
+          INFISICAL_PROJECT_ID: ${{ vars.INFISICAL_PROJECT_ID }}
+          INFISICAL_ENV: ${{ vars.INFISICAL_ENV || 'prod' }}
+          INFISICAL_UA_CLIENT_ID: ${{ secrets.INFISICAL_UA_CLIENT_ID }}
+          INFISICAL_UA_CLIENT_SECRET: ${{ secrets.INFISICAL_UA_CLIENT_SECRET }}
         run: |
-          az keyvault secret set \
-            --vault-name "${{ secrets.AZURE_KEY_VAULT_NAME }}" \
-            --name "worker-image-id" \
-            --value "$IMAGE_ID"
-
-          az keyvault secret set \
-            --vault-name "${{ secrets.AZURE_KEY_VAULT_NAME }}" \
-            --name "worker-image-version" \
-            --value "$VERSION"
+          # The image coordinates are a single global fact (one gallery image
+          # version, identical for every cell), so they live in Infisical
+          # /shared. Writing directly to one Azure KV no longer works: Infisical
+          # is push-authoritative and its sync would overwrite the direct write
+          # with the stale /shared value. Publishing here lets Infisical fan the
+          # new image-id out to EVERY cell's KV, so every cell's scaler rolls.
+          curl -1sLf 'https://dl.cloudsmith.io/public/infisical/infisical-cli/setup.deb.sh' | sudo -E bash
+          sudo apt-get install -y infisical
+
+          # Assign first, export after: `export VAR=$(cmd)` reports export's
+          # status, which would hide a failed login behind an empty token.
+          INFISICAL_TOKEN=$(infisical login --method=universal-auth \
+            --client-id="$INFISICAL_UA_CLIENT_ID" \
+            --client-secret="$INFISICAL_UA_CLIENT_SECRET" \
+            --silent --plain)
+          if [ -z "$INFISICAL_TOKEN" ]; then
+            echo "::error::Infisical login returned an empty token"
+            exit 1
+          fi
+          echo "::add-mask::$INFISICAL_TOKEN"
+          export INFISICAL_TOKEN
+
+          set_shared() {
+            # Refuse to publish an empty value: /shared is synced out to
+            # every cell KV, so a blank coordinate must fail here instead.
+            if [ -z "$2" ]; then
+              echo "::error::refusing to publish empty value for $1"
+              exit 1
+            fi
+            infisical secrets set --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
+              --path=/shared --silent "$1=$2" >/dev/null
+            echo " /shared $1 = $2"
+          }
+
+          set_shared worker-image-id "$IMAGE_ID"
+          set_shared worker-image-version "$VERSION"
 
           # Record this build's goldenVersion so the NEXT AMI build can
           # retain it as "previous golden" in /opt/opensandbox/images/bases/.
           NEW_GOLDEN=$(grep -a 'Base image golden version:' /tmp/packer-output.txt | tail -1 | awk '{print $NF}' | tr -d '\r')
           if [ -n "$NEW_GOLDEN" ]; then
-            az keyvault secret set \
-              --vault-name "${{ secrets.AZURE_KEY_VAULT_NAME }}" \
-              --name "golden-version" \
-              --value "$NEW_GOLDEN"
-            echo "Recorded new golden-version: $NEW_GOLDEN"
+            set_shared golden-version "$NEW_GOLDEN"
           else
             echo "WARN: could not extract goldenVersion from packer output — next build won't retain this base"
           fi
 
-          echo "Published image ID to Key Vault"
+          echo "Published image coordinates to Infisical /shared — sync fans out to every cell KV"
 
       - name: Cleanup old images
         run: |
diff --git a/deploy/packer/worker-ami.pkr.hcl b/deploy/packer/worker-ami.pkr.hcl
index 0cfa53d0..ab443c4b 100644
--- a/deploy/packer/worker-ami.pkr.hcl
+++ b/deploy/packer/worker-ami.pkr.hcl
@@ -55,6 +55,12 @@ variable "location" {
   default = "westus2"
 }
 
+variable "replication_regions" {
+  type        = list(string)
+  description = "Extra Azure regions to replicate the gallery image version to, beyond the build region. Must include EVERY cell region that runs workers — a cell whose region has no replica cannot boot the image. The build region (var.location) is always included automatically."
+  default     = []
+}
+
 variable "vm_size" {
   type    = string
   default = "Standard_D4ads_v7"
@@ -184,7 +190,9 @@ source "azure-arm" "worker" {
     gallery_name        = var.gallery_name
     image_name          = "osb-worker-v7"
     image_version       = "1.0.${var.image_version_patch}"
-    replication_regions = [var.location]
+    # Always include the build region; add every other cell region so each
+    # cell can boot this version from a local replica (cross-region boots fail).
+    replication_regions = distinct(concat([var.location], var.replication_regions))
   }
 
   azure_tags = {