diff --git a/.github/workflows/manual-release.yml b/.github/workflows/manual-release.yml index 8cc012d3..880700ba 100644 --- a/.github/workflows/manual-release.yml +++ b/.github/workflows/manual-release.yml @@ -13,6 +13,8 @@ permissions: packages: write env: + GO_VERSION: '1.26' + MACOS_XCODE_VERSION: 'latest-stable' REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} @@ -39,9 +41,164 @@ jobs: exit 1 fi - release: + macos-release-sign-notarize: needs: validate-tag if: needs.validate-tag.outputs.tag-exists == 'true' + runs-on: macos-latest + outputs: + release-ready: ${{ steps.apple-secrets.outputs.ready }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.tag_version }} + fetch-depth: 0 + + - name: Check Apple release secrets + id: apple-secrets + shell: bash + env: + APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} + APPLE_DEV_ID_APPLICATION_CERT_BASE64: ${{ secrets.APPLE_DEV_ID_APPLICATION_CERT_BASE64 }} + APPLE_DEV_ID_APPLICATION_CERT_PASSWORD: ${{ secrets.APPLE_DEV_ID_APPLICATION_CERT_PASSWORD }} + APPLE_DEV_ID_INSTALLER_CERT_BASE64: ${{ secrets.APPLE_DEV_ID_INSTALLER_CERT_BASE64 }} + APPLE_DEV_ID_INSTALLER_CERT_PASSWORD: ${{ secrets.APPLE_DEV_ID_INSTALLER_CERT_PASSWORD }} + APPLE_KEYCHAIN_PASSWORD: ${{ secrets.APPLE_KEYCHAIN_PASSWORD }} + APPLE_NOTARYTOOL_KEY_ID: ${{ secrets.APPLE_NOTARYTOOL_KEY_ID }} + APPLE_NOTARYTOOL_ISSUER_ID: ${{ secrets.APPLE_NOTARYTOOL_ISSUER_ID }} + APPLE_NOTARYTOOL_KEY_P8_BASE64: ${{ secrets.APPLE_NOTARYTOOL_KEY_P8_BASE64 }} + run: | + missing=() + for var in \ + APPLE_TEAM_ID \ + APPLE_DEV_ID_APPLICATION_CERT_BASE64 \ + APPLE_DEV_ID_APPLICATION_CERT_PASSWORD \ + APPLE_DEV_ID_INSTALLER_CERT_BASE64 \ + APPLE_DEV_ID_INSTALLER_CERT_PASSWORD \ + APPLE_KEYCHAIN_PASSWORD \ + APPLE_NOTARYTOOL_KEY_ID \ + APPLE_NOTARYTOOL_ISSUER_ID \ + APPLE_NOTARYTOOL_KEY_P8_BASE64 + do + if [ -z "${!var}" ]; then + missing+=("$var") + fi + done + + if [ "${#missing[@]}" -gt 0 ]; then + echo "ready=false" >> 
"$GITHUB_OUTPUT" + { + echo "### macOS release packaging skipped" + echo + echo "Missing Apple release secrets:" + for item in "${missing[@]}"; do + echo "- $item" + done + } >> "$GITHUB_STEP_SUMMARY" + exit 0 + fi + + echo "ready=true" >> "$GITHUB_OUTPUT" + echo "### macOS release packaging enabled" >> "$GITHUB_STEP_SUMMARY" + + - name: Select Xcode + if: steps.apple-secrets.outputs.ready == 'true' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: ${{ env.MACOS_XCODE_VERSION }} + + - name: Capture Xcode version + if: steps.apple-secrets.outputs.ready == 'true' + id: xcode + run: | + VERSION=$(xcodebuild -version | tr '\n' ' ' | sed 's/ */ /g') + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + xcodebuild -version + + - name: Install XcodeGen + if: steps.apple-secrets.outputs.ready == 'true' + run: | + brew list xcodegen >/dev/null 2>&1 || brew install xcodegen + xcodegen version + + - name: Set up Go + if: steps.apple-secrets.outputs.ready == 'true' + uses: actions/setup-go@v5 + with: + go-version: ${{ env.GO_VERSION }} + cache: true + + - name: Cache macOS DerivedData + if: steps.apple-secrets.outputs.ready == 'true' + uses: actions/cache@v4 + with: + path: .build/macos/DerivedData + key: macos-release-deriveddata-${{ runner.os }}-${{ steps.xcode.outputs.version }}-${{ hashFiles('proto/localbroker/**/*.proto', 'buf*.yaml', 'platform/macos/PrivilegeServices/project.yml', 'platform/macos/PrivilegeServices/**/*.swift', 'platform/macos/PrivilegeServices/**/*.plist', 'platform/macos/PrivilegeServices/**/*.entitlements', 'platform/macos/PrivilegeServices/**/*.template', 'scripts/*macos-privilege-services*.sh', 'scripts/localbroker-codegen-common.sh', 'Makefile') }} + restore-keys: | + macos-release-deriveddata-${{ runner.os }}-${{ steps.xcode.outputs.version }}- + + - name: Import Developer ID certificates + if: steps.apple-secrets.outputs.ready == 'true' + shell: bash + env: + APPLE_KEYCHAIN_PASSWORD: ${{ secrets.APPLE_KEYCHAIN_PASSWORD }} + 
APPLE_DEV_ID_APPLICATION_CERT_BASE64: ${{ secrets.APPLE_DEV_ID_APPLICATION_CERT_BASE64 }} + APPLE_DEV_ID_APPLICATION_CERT_PASSWORD: ${{ secrets.APPLE_DEV_ID_APPLICATION_CERT_PASSWORD }} + APPLE_DEV_ID_INSTALLER_CERT_BASE64: ${{ secrets.APPLE_DEV_ID_INSTALLER_CERT_BASE64 }} + APPLE_DEV_ID_INSTALLER_CERT_PASSWORD: ${{ secrets.APPLE_DEV_ID_INSTALLER_CERT_PASSWORD }} + run: | + KEYCHAIN_PATH="$RUNNER_TEMP/thand-build.keychain-db" + APP_CERT_PATH="$RUNNER_TEMP/dev-id-application.p12" + INSTALLER_CERT_PATH="$RUNNER_TEMP/dev-id-installer.p12" + + echo "$APPLE_DEV_ID_APPLICATION_CERT_BASE64" | base64 --decode > "$APP_CERT_PATH" + echo "$APPLE_DEV_ID_INSTALLER_CERT_BASE64" | base64 --decode > "$INSTALLER_CERT_PATH" + + security create-keychain -p "$APPLE_KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" + security set-keychain-settings -lut 21600 "$KEYCHAIN_PATH" + security unlock-keychain -p "$APPLE_KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" + security import "$APP_CERT_PATH" -k "$KEYCHAIN_PATH" -P "$APPLE_DEV_ID_APPLICATION_CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/security + security import "$INSTALLER_CERT_PATH" -k "$KEYCHAIN_PATH" -P "$APPLE_DEV_ID_INSTALLER_CERT_PASSWORD" -T /usr/bin/productsign -T /usr/bin/security + security list-keychains -d user -s "$KEYCHAIN_PATH" login.keychain-db + security default-keychain -d user -s "$KEYCHAIN_PATH" + security set-key-partition-list -S apple-tool:,apple: -s -k "$APPLE_KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" + + - name: Configure notarytool profile + if: steps.apple-secrets.outputs.ready == 'true' + shell: bash + env: + APPLE_NOTARYTOOL_KEY_ID: ${{ secrets.APPLE_NOTARYTOOL_KEY_ID }} + APPLE_NOTARYTOOL_ISSUER_ID: ${{ secrets.APPLE_NOTARYTOOL_ISSUER_ID }} + APPLE_NOTARYTOOL_KEY_P8_BASE64: ${{ secrets.APPLE_NOTARYTOOL_KEY_P8_BASE64 }} + run: | + KEY_PATH="$RUNNER_TEMP/AuthKey_${APPLE_NOTARYTOOL_KEY_ID}.p8" + echo "$APPLE_NOTARYTOOL_KEY_P8_BASE64" | base64 --decode > "$KEY_PATH" + xcrun notarytool store-credentials thand-ci-notary \ + --key 
"$KEY_PATH" \ + --key-id "$APPLE_NOTARYTOOL_KEY_ID" \ + --issuer "$APPLE_NOTARYTOOL_ISSUER_ID" + echo "NOTARYTOOL_PROFILE=thand-ci-notary" >> "$GITHUB_ENV" + + - name: Package signed macOS installer + if: steps.apple-secrets.outputs.ready == 'true' + env: + APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} + INPUT_TAG_VERSION: ${{ github.event.inputs.tag_version }} + run: | + PACKAGE_VERSION="${INPUT_TAG_VERSION#v}" + export PACKAGE_VERSION + ./scripts/package-macos-privilege-services-release.sh + + - name: Upload macOS installer artifact + if: steps.apple-secrets.outputs.ready == 'true' + uses: actions/upload-artifact@v4 + with: + name: thand-macos-privilege-services-pkg + path: .build/macos/PrivilegeServices/release/ThandPrivilegeServices.pkg + + release: + needs: [validate-tag, macos-release-sign-notarize] + if: needs.validate-tag.outputs.tag-exists == 'true' runs-on: ubuntu-latest steps: - name: Checkout code @@ -94,6 +251,13 @@ jobs: repository: ${{ github.repository }} run-id: ${{ steps.find-run.outputs.run-id }} + - name: Download macOS installer artifact + if: needs.macos-release-sign-notarize.outputs.release-ready == 'true' + uses: actions/download-artifact@v4 + with: + name: thand-macos-privilege-services-pkg + path: dist/ + - name: Generate changelog id: changelog run: | diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml index f5e5deb6..4506aa75 100644 --- a/.github/workflows/test-and-build.yml +++ b/.github/workflows/test-and-build.yml @@ -26,6 +26,7 @@ permissions: env: GO_VERSION: '1.26' GOEXPERIMENT: 'jsonv2' + MACOS_XCODE_VERSION: 'latest-stable' REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} @@ -49,6 +50,7 @@ jobs: - name: Run unit tests run: | + make gen-buf # Run all tests except functional and integration tests go test -v -short $(go list ./... 
| grep -v '/test/functional' | grep -v '/test/integration') @@ -73,7 +75,9 @@ jobs: run: go install go.temporal.io/sdk/contrib/tools/workflowcheck@v0.4.0 - name: Run workflowcheck - run: workflowcheck -test=false -config workflowcheck.yaml ./internal/... ./sdk/... + run: | + make gen-buf + workflowcheck -test=false -config workflowcheck.yaml ./internal/... ./sdk/... functional-tests: runs-on: ubuntu-latest @@ -94,6 +98,7 @@ jobs: - name: Run functional tests run: | + make gen-buf # Run functional tests (these use testcontainers/Docker) # Docker is available by default on ubuntu-latest runners cd test && go test -v -timeout 10m ./functional/... @@ -116,7 +121,9 @@ jobs: cache: true - name: Run services integration tests - run: cd test && go test -v -timeout 5m ./integration/services/... + run: | + make gen-buf + cd test && go test -v -timeout 5m ./integration/services/... integration-workflows: runs-on: ubuntu-latest @@ -136,7 +143,9 @@ jobs: cache: true - name: Run workflows integration tests - run: cd test && go test -v -timeout 15m ./integration/workflows/... + run: | + make gen-buf + cd test && go test -v -timeout 15m ./integration/workflows/... 
build-linux-amd64: runs-on: ubuntu-latest @@ -165,6 +174,57 @@ jobs: path: bin/thand-linux-amd64 retention-days: 1 + macos-validation: + runs-on: macos-latest + if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/main') + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 1 + submodules: recursive + + - name: Select Xcode + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: ${{ env.MACOS_XCODE_VERSION }} + + - name: Capture Xcode version + id: xcode + run: | + VERSION=$(xcodebuild -version | tr '\n' ' ' | sed 's/ */ /g') + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + xcodebuild -version + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: ${{ env.GO_VERSION }} + cache: true + + - name: Install XcodeGen + run: | + brew list xcodegen >/dev/null 2>&1 || brew install xcodegen + xcodegen version + + - name: Cache macOS DerivedData + uses: actions/cache@v4 + with: + path: .build/macos/DerivedData + key: macos-deriveddata-${{ runner.os }}-${{ steps.xcode.outputs.version }}-${{ hashFiles('proto/localbroker/**/*.proto', 'buf*.yaml', 'platform/macos/PrivilegeServices/project.yml', 'platform/macos/PrivilegeServices/**/*.swift', 'platform/macos/PrivilegeServices/**/*.plist', 'platform/macos/PrivilegeServices/**/*.entitlements', 'platform/macos/PrivilegeServices/**/*.template', 'scripts/*macos-privilege-services*.sh', 'scripts/localbroker-codegen-common.sh', 'Makefile') }} + restore-keys: | + macos-deriveddata-${{ runner.os }}-${{ steps.xcode.outputs.version }}- + + - name: Build + run: make build + + - name: Test + run: make test + + - name: Verify unsigned package layout + run: THAND_MACOS_SKIP_SIGNING=1 make package-macos-privilege-services-dev + integration-frontend: runs-on: ubuntu-latest if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/main') @@ -198,7 +258,9 @@ jobs: sudo apt-get install -y chromium-browser - 
name: Run frontend E2E integration tests - run: cd test && go test -v -timeout 30m ./integration/frontend/... + run: | + make gen-buf + cd test && go test -v -timeout 30m ./integration/frontend/... env: DISPLAY: :99 CHROME_BIN: /usr/bin/chromium-browser @@ -295,6 +357,7 @@ jobs: - name: Build for ${{ matrix.goos }}/${{ matrix.goarch }} run: | + make gen-buf VERSION=${{ steps.version.outputs.version }} COMMIT=${{ steps.version.outputs.commit }} GOOS=${{ matrix.goos }} diff --git a/.gitignore b/.gitignore index 194f163f..55601fef 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ go.work.sum /bin/ /dist/ /build/ +/.build/ # Temporary files *.tmp @@ -105,3 +106,8 @@ config/workflows/*.json config/temporal/*.key config/temporal/*.pem config/README.md + +# Generated protobuf/gRPC sources +internal/localbroker/proto/localbroker/v1/*.pb.go +platform/macos/PrivilegeServices/Generated/LocalBroker/*.swift +platform/macos/PrivilegeServices/ThandPrivilegeServices.xcodeproj/ diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 00000000..5f138856 --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,242 @@ +# Development Notes + +This file documents the supported developer loop for repo features that need local setup beyond normal Go build and test commands. + +## macOS Privilege Services + +The macOS timed-sudo path now lives under `platform/macos/PrivilegeServices`. + +### Scope + +- `xcodebuild` is the only supported native macOS build and test backend. +- XcodeGen is the source of truth for the generated Xcode project. +- `make build` and `make test` remain the top-level commands on Darwin. +- The Go agent stays unprivileged. +- The native app bundle owns `SMAppService` registration, the login item, and the privileged broker daemon. +- `brokerctl` remains installed at `/Library/Application Support/Thand/PrivilegeBroker/bin/thand-macos-privilege-brokerctl`. 
+- The installed app bundle and `brokerctl` payload are normalized to `root:wheel` ownership and non-user-writable modes during dev install. + +### Prerequisites + +- macOS with full Xcode installed, not Command Line Tools only. +- Xcode selected via `xcode-select`. +- `xcodegen` installed locally: + +```bash +brew install xcodegen +sudo xcode-select -s /Applications/Xcode.app/Contents/Developer +``` + +- Root access for install and end-to-end broker testing. +- For full local `SMAppService` integration testing: + - Xcode signed into an Apple team that can perform Apple Development signing + - `APPLE_TEAM_ID` exported in your shell as the signing TeamIdentifier + - an Apple Development signing identity that shows up as valid in the login keychain + - if your Apple Development certificate is present but marked untrusted, install the Apple Worldwide Developer Relations intermediate certificate for the current Apple Development chain (`WWDR G3`) from Apple PKI into the `System` keychain and leave trust at the default system settings + +Example: + +```bash +export APPLE_TEAM_ID=ABCDE12345 +security find-identity -v -p codesigning ~/Library/Keychains/login.keychain-db +``` + +That identity check should show at least one valid Apple Development identity for your team. `APPLE_TEAM_ID` should be the signing TeamIdentifier from the certificate subject `OU`, which may differ from the display-name suffix in the certificate common name. If the identity check still says `0 valid identities found`, install or repair the WWDR G3 intermediate and re-check before attempting the signed dev install. The `0 provisioned devices` indicator in Xcode account settings is not relevant for this macOS flow. + +To install the WWDR G3 intermediate on the local Mac: + +1. Download the current Apple Worldwide Developer Relations intermediate certificate for the Apple Development chain (`WWDR G3`) from Apple PKI. +2. Open the downloaded certificate in Keychain Access. +3. 
When prompted for a keychain, install it into the `System` keychain. +4. Leave the trust settings at `Use System Defaults`. +5. Re-run the identity check above and confirm it now shows at least one valid Apple Development identity. + +### Supported Build Outputs + +- Generated project: + - `platform/macos/PrivilegeServices/ThandPrivilegeServices.xcodeproj` +- Generated protobuf/gRPC stubs: + - `internal/localbroker/proto/localbroker/v1/*.pb.go` + - `platform/macos/PrivilegeServices/Generated/LocalBroker/*.swift` +- Repo-local native build cache: + - `.build/macos/DerivedData` +- Repo-local staging and packaging outputs: + - `.build/macos/PrivilegeServices` + +`make clean` removes those generated native build outputs. + +The checked-in protobuf contract lives in `proto/localbroker/v1/localbroker.proto`. Generated Go and Swift stubs are intentionally ignored. Use `make gen` or `make gen-buf` to regenerate them explicitly; supported build, test, package, and install targets run that generation before compiling. + +### Fast Local Loop + +Use this loop for normal development. It does not require Apple release secrets or notarization. + +```bash +make build +make test +``` + +On Darwin, those commands do all of the following: + +- build the Go agent +- generate localbroker protobuf/gRPC stubs +- Apple Development-sign `bin/thand` when `APPLE_TEAM_ID` is set for strict local broker testing +- generate the Xcode project from `platform/macos/PrivilegeServices/project.yml` +- run `xcodebuild build` +- run `xcodebuild test` + +The Make targets use coarse repo-local stamp files around the native build and test outputs, while Xcode owns the real Swift-level incrementality underneath. Re-running `make build` or `make test` with no relevant file changes should be a no-op. 
+ +For layout-only packaging verification without local signing: + +```bash +THAND_MACOS_SKIP_SIGNING=1 make package-macos-privilege-services-dev +``` + +That verifies the staged bundle structure and the installed `brokerctl` payload location, but it is not a supported broker runtime path and is not sufficient for real `SMAppService` integration. + +### Full Local Integration Loop + +Use this loop when you need to test the real login item, daemon registration, notifications, brokered sudo, reboot handling, and reconnect behavior on your own Mac. + +1. Export your Apple team ID: + +```bash +export APPLE_TEAM_ID=ABCDE12345 +``` + +2. Confirm your Apple Development identity is trusted locally: + +```bash +security find-identity -v -p codesigning ~/Library/Keychains/login.keychain-db +``` + +If you see `0 valid identities found` but your Apple Development certificate exists in Keychain Access, install the current Apple WWDR G3 intermediate certificate into the `System` keychain with default trust settings, then retry the check. + +3. Build and run native tests: + +```bash +make build +make test +``` + +When `APPLE_TEAM_ID` is set, `make build` also Apple Development-signs the local `bin/thand` binary with the `io.thand.agent` identifier so the installed helper can verify the parent agent identity during strict local broker testing. + +4. 
Package and install the Apple Development-signed dev payload: + +```bash +sudo -E make install-macos-privilege-services-dev +``` + +The install target intentionally splits responsibilities: + +- packaging and Apple Development signing happen as the invoking desktop user so the user keychain identities are available +- copying into `/Applications` and `/Library/Application Support/...` happens as root +- the installed app bundle, embedded helpers, daemon plist, and installed `brokerctl` are normalized to `root:wheel` ownership with non-user-writable modes +- the installed app is asked to `register` after install when `SUDO_USER` is available +- the staged and installed daemon keep strict peer entitlement enforcement enabled +- the staged and installed helper binaries carry the same broker peer entitlements as the release packaging path + +5. Run the installed app's registration manually if needed: + +```bash +/Applications/ThandPrivilegeServices.app/Contents/MacOS/ThandPrivilegeServices register +``` + +6. Approve background items and the daemon in System Settings when macOS prompts for them. + +7. 
Submit and verify a timed sudo request: + +```bash +thand request sudo --device device-alpha --duration 5m --reason "privilege services smoke test" +``` + +Validate all of the following locally: + +- grant succeeds and returns broker-backed metadata +- notifier shows grant, revoke, and expiry notifications +- sudoers fragment appears under `/etc/sudoers.d` +- lease state appears under `/var/db/thand/local-privilege-broker` +- revoke removes the active grant +- expiry still happens if the agent disconnects +- reboot and reconnect reconciliation still converge +- direct helper execution outside the signed agent path is still rejected + +### Useful Commands + +Show the selected Xcode: + +```bash +xcodebuild -version +xcode-select -p +``` + +Check the locally trusted Apple Development signing identities: + +```bash +security find-identity -v -p codesigning ~/Library/Keychains/login.keychain-db +``` + +Open System Settings to the login-items area: + +```bash +/Applications/ThandPrivilegeServices.app/Contents/MacOS/ThandPrivilegeServices open-settings +``` + +Check current native registration status: + +```bash +/Applications/ThandPrivilegeServices.app/Contents/MacOS/ThandPrivilegeServices status +``` + +Tail the broker log: + +```bash +sudo tail -f /var/log/thand-privilege-broker.log +``` + +### Release Packaging + +Public-distribution packaging is separate from local development: + +- local integration uses Apple Development signing +- release packaging uses Developer ID signing and notarization + +Build the release installer locally only if you have the required Developer ID identities and notarization setup: + +```bash +export APPLE_TEAM_ID=ABCDE12345 +./scripts/package-macos-privilege-services-release.sh +``` + +That produces a signed `.pkg` under `.build/macos/PrivilegeServices/release`. 
+ +### Cleanup + +Remove the locally installed dev payload with: + +```bash +sudo make uninstall-macos-privilege-services-dev +``` + +### CI Requirements + +The supported macOS CI path assumes: + +- a GitHub macOS runner with full Xcode available +- Xcode selected explicitly in the workflow +- XcodeGen installed in the workflow before `make build` and `make test` +- Go available so the repo-local buf and protobuf codegen tools can be bootstrapped into `.build` + +PR and mainline validation do not require Apple signing secrets. They run unsigned native build, test, and layout verification only. + +The optional macOS release-sign/notarize lane is separate: + +- it skips cleanly when Apple release secrets are absent +- once those secrets exist, it signs and notarizes the public `.pkg` + +### Unsupported Old Paths + +The old `platform/macos/PrivilegeBroker` SwiftPM-first layout is no longer the supported developer path. + +`swift build` and `swift test` are no longer the supported native macOS workflow for this feature. diff --git a/Makefile b/Makefile index b73b3abc..e41d52c5 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,12 @@ BUILD_DIR=bin UPX_FLAGS ?= --best --lzma --force-macos GO_BUILD_FLAGS= -ldflags "-s -w" +HOST_OS := $(shell uname -s) +PRIVILEGE_SERVICES_FILES := $(shell find platform/macos/PrivilegeServices -type f ! 
-path '*/ThandPrivilegeServices.xcodeproj/*' 2>/dev/null) +LOCALBROKER_PROTO_FILES := $(shell find proto/localbroker -type f 2>/dev/null) +LOCALBROKER_CODEGEN_FILES := buf.yaml buf.gen.go.yaml buf.gen.swift.yaml scripts/generate-localbroker-grpc.sh scripts/localbroker-codegen-common.sh scripts/macos-privilege-services-common.sh +PRIVILEGE_SERVICES_BUILD_STAMP := .build/macos/PrivilegeServices/.build-stamp +PRIVILEGE_SERVICES_TEST_STAMP := .build/macos/PrivilegeServices/.test-stamp # Default target - builds the application all: build @@ -21,24 +27,68 @@ submodules: update-submodules: git submodule update --remote --recursive +# Generate derived sources required by Go and native macOS builds. +gen: gen-buf + +gen-buf: $(LOCALBROKER_PROTO_FILES) $(LOCALBROKER_CODEGEN_FILES) + ./scripts/generate-localbroker-grpc.sh + # Build the application -build: submodules +build: submodules gen-buf go build -o $(BUILD_DIR)/$(BINARY_NAME) . +ifeq ($(HOST_OS),Darwin) + ./scripts/sign-macos-local-agent.sh $(BUILD_DIR)/$(BINARY_NAME) +endif +ifeq ($(HOST_OS),Darwin) +build: build-macos-privilege-services +endif # Build for multiple platforms -build-all: submodules +build-all: submodules gen-buf GOOS=linux GOARCH=amd64 GOEXPERIMENT=jsonv2 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY_NAME)-linux-amd64 . GOOS=linux GOARCH=arm64 GOEXPERIMENT=jsonv2 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY_NAME)-linux-arm64 . GOOS=darwin GOARCH=amd64 GOEXPERIMENT=jsonv2 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY_NAME)-darwin-amd64 . GOOS=windows GOARCH=amd64 GOEXPERIMENT=jsonv2 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY_NAME)-windows-amd64.exe . # Build linux/amd64 binary for frontend E2E tests -build-linux-amd64: submodules +build-linux-amd64: submodules gen-buf CGO_ENABLED=0 GOOS=linux GOARCH=amd64 GOEXPERIMENT=jsonv2 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY_NAME)-linux-amd64 . 
+# Build the macOS native privilege services app, login item, daemon, and brokerctl. +build-macos-privilege-services: $(PRIVILEGE_SERVICES_BUILD_STAMP) + +$(PRIVILEGE_SERVICES_BUILD_STAMP): $(PRIVILEGE_SERVICES_FILES) $(LOCALBROKER_PROTO_FILES) $(LOCALBROKER_CODEGEN_FILES) scripts/build-macos-privilege-services.sh + ./scripts/build-macos-privilege-services.sh + @mkdir -p $(dir $@) + @touch $@ + +# Run the macOS native privilege services test suite. +test-macos-privilege-services: $(PRIVILEGE_SERVICES_TEST_STAMP) + +$(PRIVILEGE_SERVICES_TEST_STAMP): $(PRIVILEGE_SERVICES_FILES) $(LOCALBROKER_PROTO_FILES) $(LOCALBROKER_CODEGEN_FILES) scripts/test-macos-privilege-services.sh + ./scripts/test-macos-privilege-services.sh + @mkdir -p $(dir $@) + @touch $@ + +# Package a development payload for local Apple Development-signed integration testing. +package-macos-privilege-services-dev: + ./scripts/package-macos-privilege-services-dev.sh + +# Install the packaged development payload for local end-to-end testing. +install-macos-privilege-services-dev: + ./scripts/install-macos-privilege-services-dev.sh + +# Remove the installed development payload from a local machine. +uninstall-macos-privilege-services-dev: + ./scripts/uninstall-macos-privilege-services-dev.sh + +# Produce the signed macOS release installer when Developer ID material is available. +package-macos-privilege-services-release: + ./scripts/package-macos-privilege-services-release.sh + # Clean build artifacts clean: - rm -rf $(BUILD_DIR) + rm -rf $(BUILD_DIR) .build internal/localbroker/proto/localbroker/v1/*.pb.go platform/macos/PrivilegeServices/Generated/LocalBroker platform/macos/PrivilegeServices/ThandPrivilegeServices.xcodeproj # Manually compress any binaries in $(BUILD_DIR) using UPX compress: @@ -50,27 +100,30 @@ compress: fi # Install the binary to GOPATH/bin -install: submodules +install: submodules gen-buf go install . # Run the application -run: submodules +run: submodules gen-buf go run . 
# Run tests -test: submodules +test: submodules gen-buf go test ./... +ifeq ($(HOST_OS),Darwin) +test: test-macos-privilege-services +endif # Run functional tests -test-functional: submodules +test-functional: submodules gen-buf cd test && go test -v ./functional/... # Run integration tests -test-integration: submodules +test-integration: submodules gen-buf cd test && go test -v ./integration/... # Run UI E2E tests (requires linux/amd64 binary for Thand server container) -test-e2e: submodules build-linux-amd64 +test-e2e: submodules gen-buf build-linux-amd64 cd test && go test -v -timeout 20m ./integration/frontend/... # Generate FlatBuffers from JSON data @@ -110,4 +163,4 @@ swagger: exit 1; \ fi -.PHONY: all build build-all build-linux-amd64 clean install run test test-functional test-integration test-e2e submodules update-submodules compress generate-data swagger workflowcheck +.PHONY: all build build-all build-linux-amd64 build-macos-privilege-services test-macos-privilege-services package-macos-privilege-services-dev install-macos-privilege-services-dev uninstall-macos-privilege-services-dev package-macos-privilege-services-release clean install run test test-functional test-integration test-e2e submodules update-submodules gen gen-buf compress generate-data swagger workflowcheck diff --git a/buf.gen.go.yaml b/buf.gen.go.yaml new file mode 100644 index 00000000..f7485f45 --- /dev/null +++ b/buf.gen.go.yaml @@ -0,0 +1,10 @@ +version: v2 +plugins: + - local: protoc-gen-go + out: internal/localbroker/proto + opt: + - paths=source_relative + - local: protoc-gen-go-grpc + out: internal/localbroker/proto + opt: + - paths=source_relative diff --git a/buf.gen.swift.yaml b/buf.gen.swift.yaml new file mode 100644 index 00000000..76e01613 --- /dev/null +++ b/buf.gen.swift.yaml @@ -0,0 +1,14 @@ +version: v2 +plugins: + - local: protoc-gen-swift + out: platform/macos/PrivilegeServices/Generated/LocalBroker + opt: + - Visibility=Public + - FileNaming=DropPath + - local: 
protoc-gen-grpc-swift-2 + out: platform/macos/PrivilegeServices/Generated/LocalBroker + opt: + - Visibility=Public + - FileNaming=DropPath + - Client=true + - Server=true diff --git a/buf.yaml b/buf.yaml new file mode 100644 index 00000000..e3284fe1 --- /dev/null +++ b/buf.yaml @@ -0,0 +1,6 @@ +version: v2 +modules: + - path: proto +lint: + use: + - STANDARD diff --git a/cmd/cli/README.md b/cmd/cli/README.md index 76eef638..11176c61 100644 --- a/cmd/cli/README.md +++ b/cmd/cli/README.md @@ -118,6 +118,16 @@ Shows current configuration including: - Login endpoint - Logging level +#### `thand config device-id` +Print the canonical device ID for the current machine. + +**Usage:** +```bash +thand config device-id +``` + +This prints the effective `device_id` only, with no extra label text, so it can be copied directly into device configuration. + #### `thand roles` List available roles from the remote login server. diff --git a/cmd/cli/config_device_id.go b/cmd/cli/config_device_id.go new file mode 100644 index 00000000..708eed1e --- /dev/null +++ b/cmd/cli/config_device_id.go @@ -0,0 +1,24 @@ +package cli + +import ( + "fmt" + + "github.com/spf13/cobra" + "github.com/thand-io/agent/internal/common" +) + +var configDeviceIDCmd = &cobra.Command{ + Use: "device-id", + Short: "Print the effective device ID for this machine", + Args: cobra.NoArgs, + SilenceUsage: true, + SilenceErrors: true, + RunE: func(cmd *cobra.Command, _ []string) error { + _, err := fmt.Fprintln(cmd.OutOrStdout(), common.GetDeviceID().String()) + return err + }, +} + +func init() { + configCmd.AddCommand(configDeviceIDCmd) +} diff --git a/cmd/cli/config_device_id_test.go b/cmd/cli/config_device_id_test.go new file mode 100644 index 00000000..ac73489e --- /dev/null +++ b/cmd/cli/config_device_id_test.go @@ -0,0 +1,46 @@ +package cli + +import ( + "bytes" + "strings" + "testing" + + "github.com/spf13/cobra" + "github.com/thand-io/agent/internal/common" +) + +func 
TestConfigDeviceIDCommandPrintsEffectiveDeviceID(t *testing.T) { + cmd := &cobra.Command{} + var out bytes.Buffer + cmd.SetOut(&out) + + if err := configDeviceIDCmd.RunE(cmd, nil); err != nil { + t.Fatalf("RunE returned error: %v", err) + } + + got := strings.TrimSpace(out.String()) + want := common.GetDeviceID().String() + + if got != want { + t.Fatalf("printed device ID = %q, want %q", got, want) + } +} + +func TestConfigDeviceIDCommandWritesOnlyToStdout(t *testing.T) { + cmd := &cobra.Command{} + var out bytes.Buffer + var stderr bytes.Buffer + cmd.SetOut(&out) + cmd.SetErr(&stderr) + + if err := configDeviceIDCmd.RunE(cmd, nil); err != nil { + t.Fatalf("RunE returned error: %v", err) + } + + if stderr.Len() != 0 { + t.Fatalf("stderr = %q, want empty", stderr.String()) + } + if strings.TrimSpace(out.String()) == "" { + t.Fatal("stdout was empty") + } +} diff --git a/cmd/cli/main.go b/cmd/cli/main.go index 16f17934..40ad8053 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -1,6 +1,7 @@ package cli import ( + "context" "errors" "fmt" "os" @@ -120,6 +121,17 @@ func preRunConfigE(cmd *cobra.Command, mode config.Mode) error { case config.ModeAgent: + // Materialize provider, role, and workflow definitions from the current + // configured sources (path/url/vault/defaults) before provider + // initialization. The agent bootstrap path currently only returns + // services/device state from the login server, so InitializeProviders + // still depends on this local definition load. 
+ err = cfg.ReloadConfig() + if err != nil { + logrus.WithError(err).Errorln("Failed to load local agent configuration") + return fmt.Errorf("failed to load local agent configuration: %w", err) + } + // Initialize providers err = cfg.InitializeProviders() @@ -128,6 +140,11 @@ func preRunConfigE(cmd *cobra.Command, mode config.Mode) error { return err } + isDefaultLoginEndpoint := cfg.GetLoginServerUrl() == common.DefaultLoginServerEndpoint + if cfg.HasLoginServer() && !isDefaultLoginEndpoint { + go cfg.RunDeviceBootstrap(context.Background()) + } + case config.ModeServer: // Load local config first before registering with the thand server. diff --git a/cmd/cli/sudo.go b/cmd/cli/sudo.go new file mode 100644 index 00000000..f43ff38f --- /dev/null +++ b/cmd/cli/sudo.go @@ -0,0 +1,82 @@ +package cli + +import ( + "fmt" + "runtime" + + "github.com/spf13/cobra" + "github.com/thand-io/agent/internal/common" + "github.com/thand-io/agent/internal/models" +) + +var sudoCmd = &cobra.Command{ + Use: "sudo [command...]", + Short: "Request local sudo access or run a privileged command", + Long: `Request time-bound local sudo access or run a single privileged command through the local provider.`, + PreRunE: preRunClientConfigWithSessionE, + RunE: func(cmd *cobra.Command, args []string) error { + reason, _ := cmd.Flags().GetString("reason") + duration, _ := cmd.Flags().GetString("duration") + device, _ := cmd.Flags().GetString("device") + if !cmd.Flags().Changed("device") { + device = common.GetDeviceID().String() + } + request, err := buildLocalSudoElevationRequest(args, reason, duration, device) + if err != nil { + return err + } + + return MakeElevationRequest(request) + }, +} + +func buildLocalSudoElevationRequest(args []string, reason, duration, device string) (*models.ElevateRequest, error) { + return buildLocalSudoElevationRequestForOS(runtime.GOOS, args, reason, duration, device) +} + +func buildLocalSudoElevationRequestForOS(goos string, args []string, reason, duration, 
device string) (*models.ElevateRequest, error) { + if len(reason) == 0 { + return nil, fmt.Errorf("--reason is required") + } + if cfg == nil { + return nil, fmt.Errorf("configuration is not loaded") + } + + metadata := models.LocalSudoRequestMetadata{ + Mode: models.LocalSudoModeTimed, + } + + if len(args) > 0 { + if goos == "darwin" { + return nil, fmt.Errorf("privileged command mode is not supported on macOS in broker v1; request timed sudo access instead") + } + metadata.Mode = models.LocalSudoModeCommand + metadata.Command = append([]string(nil), args...) + } + + role, err := cfg.GetRoleByName(models.LocalSudoRoleIdentifier) + if err != nil { + return nil, fmt.Errorf("local sudo role %q is not configured: %w", models.LocalSudoRoleIdentifier, err) + } + + request := &models.ElevateRequest{ + Role: models.CloneRole(role), + Device: device, + Reason: reason, + Duration: duration, + Metadata: metadata.AsMap(), + } + if err := models.NormalizeLocalSudoRequest(request, cfg.GetProviders().Definitions); err != nil { + return nil, err + } + + return request, nil +} + +func init() { + requestCmd.AddCommand(sudoCmd) + + sudoCmd.Flags().StringP("duration", "d", "", "Duration of timed sudo access (for example 30m or 1h)") + sudoCmd.Flags().StringP("reason", "e", "", "Reason for the sudo request") + sudoCmd.Flags().String("device", "", "Canonical device_id for local sudo execution (defaults to the current device when omitted)") +} diff --git a/cmd/cli/sudo_test.go b/cmd/cli/sudo_test.go new file mode 100644 index 00000000..2a9b4daa --- /dev/null +++ b/cmd/cli/sudo_test.go @@ -0,0 +1,248 @@ +package cli + +import ( + "testing" + + "github.com/spf13/cobra" + "github.com/thand-io/agent/internal/common" + configpkg "github.com/thand-io/agent/internal/config" + "github.com/thand-io/agent/internal/models" +) + +func TestBuildLocalSudoElevationRequestTimed(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = newTestSudoConfig("local-custom") + + 
request, err := buildLocalSudoElevationRequest(nil, "system maintenance", "30m", "device-alpha") + if err != nil { + t.Fatalf("buildLocalSudoElevationRequest returned error: %v", err) + } + + if got, want := request.Workflow, models.LocalSudoTimedWorkflowName; got != want { + t.Fatalf("workflow = %q, want %q", got, want) + } + if got, want := request.Duration, "30m"; got != want { + t.Fatalf("duration = %q, want %q", got, want) + } + if got, want := request.Providers[0], "local-custom"; got != want { + t.Fatalf("provider = %q, want %q", got, want) + } + if request.Metadata["mode"] != string(models.LocalSudoModeTimed) { + t.Fatalf("mode = %#v, want %q", request.Metadata["mode"], models.LocalSudoModeTimed) + } + if got, want := request.Device, "device-alpha"; got != want { + t.Fatalf("device = %#v, want %q", got, want) + } + if got, want := request.Metadata["device_id"], "device-alpha"; got != want { + t.Fatalf("device_id = %#v, want %q", got, want) + } + if !containsString(request.Role.Providers, "local-custom") { + t.Fatalf("request role providers = %#v, want provider alias included", request.Role.Providers) + } +} + +func TestBuildLocalSudoElevationRequestCommandUsesDefaultDuration(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = newTestSudoConfig("local-elevation") + + request, err := buildLocalSudoElevationRequestForOS("linux", []string{"whoami"}, "check user", "", "device-beta") + if err != nil { + t.Fatalf("buildLocalSudoElevationRequest returned error: %v", err) + } + + if got, want := request.Workflow, models.LocalSudoCommandWorkflowName; got != want { + t.Fatalf("workflow = %q, want %q", got, want) + } + if got, want := request.Duration, models.LocalSudoCommandDuration; got != want { + t.Fatalf("duration = %q, want %q", got, want) + } + command, ok := request.Metadata["command"].([]string) + if !ok { + t.Fatalf("metadata command type = %T, want []string", request.Metadata["command"]) + } + if len(command) != 1 || 
command[0] != "whoami" { + t.Fatalf("metadata command = %#v, want [\"whoami\"]", command) + } + if got, want := request.Metadata["device_id"], "device-beta"; got != want { + t.Fatalf("device_id = %#v, want %q", got, want) + } +} + +func TestBuildLocalSudoElevationRequestDarwinCommandIsUnsupported(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = newTestSudoConfig("local-elevation") + + if _, err := buildLocalSudoElevationRequestForOS("darwin", []string{"whoami"}, "check user", "", "device-beta"); err == nil { + t.Fatal("expected Darwin command-mode error") + } +} + +func TestBuildLocalSudoElevationRequestRequiresTimedDuration(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = newTestSudoConfig("local-elevation") + + if _, err := buildLocalSudoElevationRequest(nil, "missing duration", "", "device-alpha"); err == nil { + t.Fatal("expected error for missing timed duration") + } +} + +func TestBuildLocalSudoElevationRequestPrefersLocalElevationProvider(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = &configpkg.Config{ + Providers: configpkg.ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + "local": { + Name: "Local", + Provider: "local", + Enabled: true, + }, + "local-elevation": { + Name: "Local Elevation", + Provider: "local", + Enabled: true, + }, + }, + }, + Roles: configpkg.RoleConfig{ + Definitions: map[string]models.Role{ + models.LocalSudoRoleIdentifier: { + Name: "Local Sudo", + Identifier: models.LocalSudoRoleIdentifier, + Providers: []string{"local", "local-elevation"}, + Workflows: []string{models.LocalSudoTimedWorkflowName, models.LocalSudoCommandWorkflowName}, + Permissions: models.RolePermissions{ + Allow: models.RoleStatements{ + {Operations: []string{"local:sudo:*"}}, + }, + }, + }, + }, + }, + } + + request, err := buildLocalSudoElevationRequest(nil, "system maintenance", "30m", "device-alpha") + if err != 
nil { + t.Fatalf("buildLocalSudoElevationRequest returned error: %v", err) + } + + if got, want := request.Providers[0], "local-elevation"; got != want { + t.Fatalf("provider = %q, want %q", got, want) + } +} + +func newTestSudoConfig(providerName string) *configpkg.Config { + return &configpkg.Config{ + Providers: configpkg.ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + providerName: { + Name: "Local", + Provider: "local", + Enabled: true, + }, + }, + }, + Roles: configpkg.RoleConfig{ + Definitions: map[string]models.Role{ + models.LocalSudoRoleIdentifier: { + Name: "Local Sudo", + Identifier: models.LocalSudoRoleIdentifier, + Providers: []string{"local", "local-elevation"}, + Workflows: []string{models.LocalSudoTimedWorkflowName, models.LocalSudoCommandWorkflowName}, + Permissions: models.RolePermissions{ + Allow: models.RoleStatements{ + {Operations: []string{"local:sudo:*"}}, + }, + }, + }, + }, + }, + } +} + +func containsString(values []string, target string) bool { + for _, value := range values { + if value == target { + return true + } + } + return false +} + +func TestBuildLocalSudoElevationRequestRequiresConfiguredEnvironment(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = nil + + if _, err := buildLocalSudoElevationRequest(nil, "system maintenance", "30m", ""); err == nil { + t.Fatal("expected error when config is unavailable") + } +} + +func TestSudoCommandDefaultsDeviceToCurrentMachineWhenFlagOmitted(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = newTestSudoConfig("local-elevation") + + cmd := &cobra.Command{Use: "sudo"} + cmd.Flags().String("device", "", "") + + device, err := cmd.Flags().GetString("device") + if err != nil { + t.Fatalf("GetString(device) returned error: %v", err) + } + if cmd.Flags().Changed("device") { + t.Fatal("device flag should not be marked changed when omitted") + } + if !cmd.Flags().Changed("device") { + device = 
common.GetDeviceID().String() + } + + request, err := buildLocalSudoElevationRequest(nil, "system maintenance", "30m", device) + if err != nil { + t.Fatalf("buildLocalSudoElevationRequest returned error: %v", err) + } + + if got, want := request.Device, common.GetDeviceID().String(); got != want { + t.Fatalf("device = %q, want %q", got, want) + } + if got, want := request.Metadata["device_id"], common.GetDeviceID().String(); got != want { + t.Fatalf("metadata device_id = %#v, want %q", got, want) + } +} + +func TestSudoCommandPreservesExplicitEmptyDeviceFlag(t *testing.T) { + previousCfg := cfg + t.Cleanup(func() { cfg = previousCfg }) + + cfg = newTestSudoConfig("local-elevation") + + cmd := &cobra.Command{Use: "sudo"} + cmd.Flags().String("device", "", "") + if err := cmd.Flags().Set("device", ""); err != nil { + t.Fatalf("Set(device) returned error: %v", err) + } + + device, err := cmd.Flags().GetString("device") + if err != nil { + t.Fatalf("GetString(device) returned error: %v", err) + } + if !cmd.Flags().Changed("device") { + t.Fatal("device flag should be marked changed when explicitly set") + } + + if _, err := buildLocalSudoElevationRequest(nil, "system maintenance", "30m", device); err == nil { + t.Fatal("expected explicit empty device flag to remain empty and fail validation") + } +} diff --git a/docs/api/agent/configuration.md b/docs/api/agent/configuration.md index 323aa604..fdacfa92 100644 --- a/docs/api/agent/configuration.md +++ b/docs/api/agent/configuration.md @@ -56,7 +56,7 @@ Currently a stub endpoint for future pre-flight validation. ## Register Agent -Register an agent with the server. +Bootstrap agent configuration from the server. **POST** `/register` @@ -64,13 +64,18 @@ Register an agent with the server. - Server Mode Only +`/register` is a configuration/bootstrap handshake only. It returns server-managed config snapshots, but it does not publish live device routes. 
Running agents publish their live route directly to Temporal after bootstrap succeeds. + ### Request Body ```json { + "mode": "agent", + "identifier": "11111111-2222-3333-4444-555555555555", "environment": { - "name": "production", - "description": "Production environment configuration" + "name": "workstation-alpha", + "hostname": "workstation-alpha.example.test", + "platform": "local" } } ``` @@ -129,6 +134,11 @@ The registration response contains the complete configuration for the agent, inc If the upstream server has a newer version of the configuration, the agent will update its local configuration to match the server's state. This ensures that policies and configurations are consistent across the infrastructure. +For device-local workflows: + +- servers publish device definitions/policy to the shared device-definition registry +- agents publish live `device_id -> task_queue` route state to the shared device-route registry + ## Post-flight Check Validate configuration after registration. diff --git a/docs/api/agent/elevation.md b/docs/api/agent/elevation.md index f6e4cc92..8f698e6d 100644 --- a/docs/api/agent/elevation.md +++ b/docs/api/agent/elevation.md @@ -187,3 +187,7 @@ Raw encrypted workflow state or task token for resuming workflows. - Used internally by workflow engine to resume paused workflows - State parameter contains encrypted workflow context - Supports both query parameter and body-based resumption + +## Related Guides + +- [Local Sudo Usage](local-sudo.md) diff --git a/docs/api/agent/local-sudo.md b/docs/api/agent/local-sudo.md new file mode 100644 index 00000000..97b98813 --- /dev/null +++ b/docs/api/agent/local-sudo.md @@ -0,0 +1,97 @@ +--- +layout: default +title: Local Sudo +parent: Agent +grand_parent: API Reference +nav_order: 5 +--- + +# Local Sudo + +Local sudo lets thand request short-lived privileged access on a specific registered device. 
+ +Use this feature when you want a normal thand approval workflow to grant temporary local administrative access on a machine that is running a thand agent. + +## Request Types + +Local sudo supports two modes: + +- timed access, which grants sudo for a bounded lease +- command mode, which runs a specific command and cleans up immediately afterward + +On macOS broker v1, only timed access is supported. Command mode remains available on other platforms that still use the legacy local-provider path. + +For the supported macOS native developer loop, see `DEVELOPMENT.md`. The repo now uses XcodeGen plus `xcodebuild` through `make build`, `make test`, and `sudo -E make install-macos-privilege-services-dev`. + +## Requesting Timed Local Sudo + +CLI example: + +```bash +thand request sudo --device 11111111-2222-3333-4444-555555555555 --duration 30m --reason "System maintenance" +``` + +If `--device` is omitted, the CLI defaults to the current machine's canonical `device_id`. +If `--device` is provided explicitly, the CLI uses that exact value, even if it is empty. + +Static web example: + +```text +/api/v1/elevate?role=local_sudo&device=11111111-2222-3333-4444-555555555555&duration=PT30M&reason=System+maintenance +``` + +## Requesting Command Mode + +CLI example: + +```bash +thand request sudo --device 11111111-2222-3333-4444-555555555555 --reason "Install updates" -- softwareupdate -i -a +``` + +Command mode defaults to a short duration window and removes the local grant immediately after the command finishes. + +## Device Availability + +If the target device is offline, local sudo does not fail immediately. + +Authorize waits for a fresh device route for a bounded window: + +- up to the requested sudo duration +- capped at 5 minutes + +If the device does not reconnect in that window, the request fails instead of succeeding unexpectedly later. 
+ +## Workflow Behavior + +Local sudo resolves device-local execution details such as the target device and local account mapping as part of the internal execution-planning work that runs at the start of `authorize`. + +That planning step reads shared device policy from the Temporal-backed device-definition registry rather than depending on the handling server having the target device configured locally. + +Workflows can require macOS local presence before authorization by adding a `thand: approvals` notifier with `provider: local-presence`. The server routes the challenge to the selected `device_id`; the macOS agent then asks the signed helper to run `deviceOwnerAuthentication`, which can be satisfied by Touch ID, Apple Watch, or passcode depending on local policy. The returned result is applied through the same approval evaluation path as Slack and email callbacks. + +If you are authoring or reviewing workflows, see [Workflow Tasks](/configuration/workflows/tasks.html) for the `authorize` lifecycle and execution-planning behavior. + +## Revoke Behavior + +Timed revoke is reconciliation-oriented. + +- if the device is online, revoke is dispatched immediately +- if the device is offline, revoke remains pending until the device reconnects and the server can reconcile state + +Timed access is still expected to expire locally on the device based on the local lease. The pending revoke exists so the workflow can converge and leave an accurate audit trail. + +On macOS broker v1, expiry and revoke are enforced by the native broker even if the agent disconnects after authorization. + +## Copy / Resume URLs + +The static request page preserves the `device` field in copied request URLs so the target device stays attached when the request is reopened later. + +The `device` value is the canonical `device_id`. Operators can print the current machine's device ID with `thand config device-id`. + +Live device routing is also keyed by that same `device_id`. 
+ +## Related Docs + +- [Local Sudo Configuration](/configuration/local-sudo.html) +- [Elevation (Access Request) Endpoints](/api/agent/elevation.html) +- [Workflow Tasks](/configuration/workflows/tasks.html) diff --git a/docs/configuration/index.md b/docs/configuration/index.md index bcc37590..600533e5 100644 --- a/docs/configuration/index.md +++ b/docs/configuration/index.md @@ -85,3 +85,4 @@ export THAND_PROVIDERS_AWS_REGION="us-west-2" - **[Providers](providers)** - Provider configurations - **[Roles](roles)** - Role definitions and mappings - **[Workflows](workflows)** - Custom approval workflows +- **[Local Sudo](local-sudo)** - Device-local sudo configuration diff --git a/docs/configuration/local-sudo.md b/docs/configuration/local-sudo.md new file mode 100644 index 00000000..93c9ad56 --- /dev/null +++ b/docs/configuration/local-sudo.md @@ -0,0 +1,182 @@ +--- +layout: default +title: Local Sudo +parent: Configuration +nav_order: 8 +--- + +# Local Sudo Configuration + +Local sudo is configured in two parts: + +- the local provider enables the device-local execution backend +- per-device policy decides who can request sudo on a given machine and which local account to use + +## Required Configuration + +Minimal device policy example: + +```yaml +devices: + device-alpha: + device_id: "11111111-2222-3333-4444-555555555555" + name: "Example Workstation" + enabled: true + local_elevation: + enabled: true + accounts: + - email: "user@example.com" + local_username: "localuser" +``` + +Production agents use a generated machine-derived `device_id`. You can print the current machine's value with: + +```bash +thand config device-id +``` + +For deterministic dev/CI setups, non-production builds may override the generated value with `THAND_DEV_DEVICE_ID_OVERRIDE`. That override path is intentionally not available in production binaries. 
+ +## Provider Configuration + +If you are using the built-in local provider defaults, you usually do not need to restate the provider stanza at all. + +The embedded `local_sudo` role ships with the timed sudo workflow only. Command mode remains available to custom roles and workflows, but it is not part of the default local sudo role. + +Example: + +```yaml +providers: + local-elevation: + provider: local + enabled: true + local-presence: + provider: local-presence + enabled: true + local-notification: + provider: local-notification + enabled: true +``` + +For full local macOS integration testing, install the Apple Development-signed privilege-services bundle with: + +```bash +export APPLE_TEAM_ID=ABCDE12345 +sudo -E make install-macos-privilege-services-dev +``` + +That installs: + +- `/Applications/ThandPrivilegeServices.app` +- `/Library/Application Support/Thand/PrivilegeBroker/bin/thand-macos-privilege-brokerctl` + +The installed macOS privilege-services payload is normalized to `root:wheel` ownership and non-user-writable modes during install. + +If you only need unsigned layout verification and not real `SMAppService` registration, you can instead use: + +```bash +THAND_MACOS_SKIP_SIGNING=1 make package-macos-privilege-services-dev +``` + +That unsigned mode is limited to bundle layout verification and is not a supported broker runtime path. Full local `SMAppService` and broker testing is expected to use the Apple Development-signed install flow above. + +On macOS v1, timed sudo is brokered through the native privilege-services app bundle and daemon. The broker owns sudoers fragments, lease persistence, expiry, and revocation. + +If a workflow uses an approval notifier with `provider: local-presence`, the same signed helper can trigger a macOS device-owner authentication prompt on the routed device before `authorize` runs. The result is recorded through the normal approvals path, alongside Slack and email approval callbacks. 
This requires the target agent to be online in an interactive login session; CI and unit tests mock the LocalAuthentication result rather than invoking biometrics. + +Workflows can also opt into Thand-managed macOS notifications with `provider: local-notification` in `thand: notify`, approval notifiers, `authorize.with.notifiers`, or `revoke.with.notifiers`. These notifications are routed to the target `device_id` and posted by the signed helper through `UNUserNotificationCenter`. Broker-triggered lease notifications remain enabled separately for now. + +## Account Mapping + +Per-device account mappings decide which local account receives sudo. + +You can match by: + +- `identity` +- `email` +- `username` + +Example: + +```yaml +devices: + device-alpha: + device_id: "11111111-2222-3333-4444-555555555555" + enabled: true + local_elevation: + enabled: true + accounts: + - identity: "identity-abc123" + local_username: "localuser" + - email: "user@example.com" + local_username: "localuser" +``` + +## Allowed Modes + +You can restrict a device to specific local-sudo modes. + +```yaml +devices: + device-alpha: + device_id: "11111111-2222-3333-4444-555555555555" + local_elevation: + enabled: true + allowed_modes: + - timed + - command +``` + +If `allowed_modes` is omitted, both timed and command mode are allowed at the device-policy layer, but the embedded `local_sudo` role still exposes only the timed workflow by default. + +On macOS v1, command mode is intentionally disabled while the broker only supports timed sudoers grants. + +## Guardrails + +You can add guardrails for unsafe local targets. + +```yaml +devices: + device-alpha: + device_id: "11111111-2222-3333-4444-555555555555" + local_elevation: + enabled: true + denied_usernames: + - root + - daemon + - nobody + allowed_uid_ranges: + - "1000-60000" +``` + +`denied_usernames` blocks sensitive local accounts even if they are mapped accidentally. 
+ +`allowed_uid_ranges` constrains requests to human-style local accounts instead of system accounts. + +## Operational Notes + +- local sudo routes only to fresh live agent registration state +- live routes are published by agents to the shared Temporal device-route registry +- local-sudo execution planning reads device policy from the shared Temporal device-definition registry +- the device identity is the canonical `device_id` +- operators can print the local machine device ID with `thand config device-id` +- `thand request sudo` defaults to the current machine when `--device` is omitted +- static `execution_target` routing is no longer used +- local-sudo execution planning runs internally at the start of `authorize` +- authorize waits for the device for a bounded window +- on macOS, revoke and expiry are enforced locally by the broker even if the agent disconnects + +## Security Notes + +The current generated `device_id` is enough for routing and local development, but it is not yet a cryptographically enrolled device identity. + +The macOS privilege broker improves local trust boundaries, but it does not replace future enrolled device identity. + +Future work is expected to add stronger enrolled device identity and a more formal device control plane. + +## Related Docs + +- [Local Sudo Usage](/api/agent/local-sudo.html) +- [Workflow Tasks](/configuration/workflows/tasks.html) +- [Configuration](/configuration/) +- repo developer guide: `DEVELOPMENT.md` diff --git a/docs/configuration/workflows/index.md b/docs/configuration/workflows/index.md index a7f4ba85..59947ec6 100644 --- a/docs/configuration/workflows/index.md +++ b/docs/configuration/workflows/index.md @@ -17,9 +17,10 @@ Thand workflows orchestrate the complete lifecycle of access requests: 1. **Validation** - Verify request validity and user permissions 2. **Approval** - Route requests through approval chains with notifications -3. **Authorization** - Grant temporary access to requested resources -4. 
**Monitoring** - Track usage and detect policy violations -5. **Revocation** - Remove access when complete or violated +3. **Execution Planning** - Compile the final request into an execution plan +4. **Authorization** - Grant temporary access to requested resources +5. **Monitoring** - Track usage and detect policy violations +6. **Revocation** - Remove access when complete or violated Workflows leverage the [Serverless Workflow DSL](https://serverlessworkflow.io/specification/) for standardized process definition while providing custom Thand-specific tasks for access control operations. @@ -57,6 +58,19 @@ workflows: - grant: { thand: authorize } ``` +## Execution Planning + +Access-granting workflows should treat execution planning as part of the standard `authorize` lifecycle. + +`authorize` runs an internal execution-plan activity after validation, approvals, and any other step that can still change the final request. That activity compiles the request into an internal execution plan that `authorize` and `revoke` later consume. + +In practice, the safe patterns are: + +- `validate -> authorize` +- `validate -> approvals -> authorize` + +If a workflow grants access, do not put any request-shaping step after `authorize`, because the execution plan is snapshotted there. 
+ ## Workflow Structure ### Basic Configuration @@ -132,7 +146,7 @@ workflows: approved: authorize denied: deny-notification then: deny-notification - + # Step 3: Grant access if approved - authorize: thand: authorize @@ -287,6 +301,7 @@ workflows: do: - validate: { thand: validate } - approve: { thand: approvals } + - authorize: { thand: authorize } ``` ## Workflow Patterns @@ -399,12 +414,11 @@ workflows: - when: '${ .duration > "PT4H" }' then: manager-approval - when: '${ .user.department == "security" }' - then: auto-approve + then: authorize default: standard-approval - security-approval: { thand: approvals, then: authorize } - manager-approval: { thand: approvals, then: authorize } - - auto-approve: { thand: authorize, then: end } - standard-approval: { thand: approvals, then: authorize } - authorize: { thand: authorize, then: end } ``` diff --git a/docs/configuration/workflows/tasks.md b/docs/configuration/workflows/tasks.md index ae513c5e..351b615f 100644 --- a/docs/configuration/workflows/tasks.md +++ b/docs/configuration/workflows/tasks.md @@ -127,6 +127,7 @@ The `approvals` task handles approval workflows by sending notifications to appr thand: approvals with: approvals: number # Required approvals + timeout: duration # Optional overall deadline; requires on.timeout notifiers: # Notification configuration key: provider: string @@ -135,6 +136,7 @@ The `approvals` task handles approval workflows by sending notifications to appr on: approved: target-step denied: target-step + timeout: target-step # Required when with.timeout is set then: default-step ``` @@ -143,7 +145,32 @@ The `approvals` task handles approval workflows by sending notifications to appr | Parameter | Type | Required | Description | |-----------|------|----------|-------------| | `approvals` | number | Yes | Number of approvals required | -| `notifiers` | object | Yes | Notification configuration | +| `timeout` | duration | No | Overall approval deadline. 
Requires `on.timeout` when set | +| `notifiers` | object | No | Notification configuration | + +### Local Presence Configuration + +Use a normal notifier entry with `provider: local-presence` to add a device-routed macOS owner-authentication prompt. The routed agent reports the prompt result back into the same approval result path used by Slack and email callbacks, so an approved local presence check can satisfy the configured `approvals` count. + +```yaml +notifiers: + touch_id: + provider: local-presence + device: "${ .device }" + prompt: "Approve this access request on your Mac" +``` + +Device resolution uses this order: + +1. the notifier entry `device` +2. the elevation request `device` +3. request metadata `device_id` + +Local presence uses the approvals task's remaining `with.timeout` deadline for device routing and the native macOS prompt. If the target device is offline or the prompt times out, the task routes to `on.timeout` instead of treating the result as a denial. User cancel or authentication failure is a denial and routes to `on.denied`. + +Local presence is not ordered ahead of other notifiers inside one approvals task. If you want "try Touch ID first, then fall back to humans," model that as two approvals tasks: a short local-presence task whose `on.timeout` branch points to a second Slack/email approvals task. + +The check requires a live macOS agent in an interactive login session. It proves that macOS device-owner authentication succeeded on the routed device; it is not a cryptographic enrolled-device attestation. 
### Notifiers Configuration @@ -163,6 +190,8 @@ notifiers: |----------|---------|---------------| | `slack` | Slack notifications | Channel ID: `C0123456789` or User ID | | `email` | Email notifications | Email address | +| `local-presence` | macOS device-owner approval prompt | `device_id` | +| `local-notification` | macOS local user notification | `device_id` | ### Flow Control @@ -173,11 +202,12 @@ The approvals task uses the `on` directive for conditional flow: thand: approvals with: approvals: 2 + timeout: 15m notifiers: ... on: approved: grant-access # If approved denied: send-denial # If denied - then: timeout-handler # If insufficient approvals (loops back) + timeout: timeout-handler # If no final answer before timeout ``` ### Approval Logic @@ -188,7 +218,8 @@ The approvals task implements the following logic: 3. Collects approvals in the workflow context 4. If any approval is `false` (denied), routes to the `denied` state 5. If the number of `true` approvals meets the required count, routes to the `approved` state -6. Otherwise, loops back to wait for more approvals +6. If `with.timeout` expires before approval or denial, routes to the `timeout` state +7. 
Otherwise, loops back to wait for more approvals ### Examples @@ -198,6 +229,7 @@ The approvals task implements the following logic: thand: approvals with: approvals: 1 + timeout: 30m notifiers: slack: provider: slack @@ -207,7 +239,7 @@ The approvals task implements the following logic: on: approved: grant-access denied: deny-request - then: deny-request + timeout: deny-request ``` **Email Approval** @@ -226,6 +258,77 @@ The approvals task implements the following logic: denied: denied ``` +**macOS Local Presence-Gated Sudo** +```yaml +- presence: + thand: approvals + with: + approvals: 1 + timeout: 2m + notifiers: + touch_id: + provider: local-presence + device: "${ .device }" + prompt: "Approve this sudo request on your Mac" + on: + approved: authorize + denied: denied + timeout: human_approval +``` + +## Execution Planning + +`authorize` runs an internal execution-plan activity that compiles the current workflow request into the execution plan later used by `authorize` and `revoke`. + +Treat `authorize` as the last request-shaping step before access is granted. It should run only after approvals, form collection, and other workflow logic that might change the final request shape. + +### Execution-Plan Process + +The execution-plan activity: + +1. **Reads** the normalized elevate request from workflow context +2. **Resolves** the provider, identity, device, and local policy data needed for execution +3. **Compiles** one or more provider authorization requests into an internal execution plan +4. 
**Stores** that execution plan in workflow context/history for later `authorize` and `revoke` steps + +### Requirements and Constraints + +- execution planning is required before provider authorization work starts +- `authorize` should appear immediately after approvals or any other request-shaping step +- `authorize` and `revoke` depend on the recorded execution plan and do not rebuild it later +- `revoke` fails if the execution plan is missing + +### Failure Behavior + +- if execution planning cannot compile the request, the workflow fails before authorization starts +- if `revoke` runs without a recorded execution plan, it fails instead of trying to recover implicitly + +### Examples + +**Validation, Approval, and Authorization** +```yaml +- validate: + thand: validate + then: approvals + +- approvals: + thand: approvals + on: + approved: authorize + denied: denied + then: denied + +- authorize: + thand: authorize +``` + +**Validation and Authorization Without Approval** +```yaml +- validate: + thand: validate + then: authorize +``` + ## authorize The `authorize` task grants temporary access to the requested role and resources. @@ -250,14 +353,17 @@ The `authorize` task grants temporary access to the requested role and resources The authorize task: -1. **Validates** the request is approved (checks workflow context) -2. **Creates** temporary credentials/access across all specified providers -3. **Registers** the session information -4. **Returns** authorization details with timestamps +1. **Builds or reuses** the recorded execution plan from workflow context/history +2. **Validates** the request is approved (checks workflow context) +3. **Creates** temporary credentials/access across all specified providers +4. **Registers** the session information +5. **Returns** authorization details with timestamps ### Authorization Context -The authorize task checks if the request has been approved by looking at the workflow context. 
If already approved, it returns basic model output with timestamps. +The authorize task checks if the request has been approved by looking at the workflow context. It also snapshots the final request into the execution plan it and `revoke` later consume. + +For any workflow that grants access, treat `authorize` as the first step that may perform provider-side effects. ### Examples @@ -352,8 +458,8 @@ The `revoke` task removes granted access and cleans up temporary credentials. The revoke task: -1. **Validates** the elevate request from workflow context -2. **Iterates** through all providers and identities +1. **Reads** the recorded execution plan from workflow context/history +2. **Uses** the stored authorization request shape to build revocation work 3. **Calls** provider-specific revocation methods 4. **Logs** revocation events 5. **Returns** revocation status with timestamp @@ -386,12 +492,10 @@ The `notify` task sends notifications to users, administrators, or external syst - notify: thand: notify with: - approvals: number # Number of approvals needed - notifiers: # Notification configuration - key: - provider: string - to: string - message: string + provider: string + to: string + message: string + device: string # Optional; used by local-notification then: next-step ``` @@ -399,8 +503,10 @@ The `notify` task sends notifications to users, administrators, or external syst | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| `approvals` | number | Yes | Number of approvals required (for approval notifications) | -| `notifier` | object | Yes | Notification configuration | +| `provider` | string | Yes | Notification provider | +| `to` | string or array | Yes | Recipient identity, channel, or email address | +| `message` | string | No | Notification body | +| `device` | string | No | Target device for `local-notification` | ### Notification Process @@ -414,6 +520,9 @@ The notify task: - **Slack**: Sends rich notifications 
with approval buttons - **Email**: Sends email notifications +- **Local Notification**: Sends a macOS user notification on a routed agent device with `provider: local-notification` + +`local-notification` requires a live macOS agent and a target device. Device resolution follows the explicit notifier `device` field first, then the request `device`, then request metadata `device_id`. The same routing applies when `local-notification` is used in `thand: notify`, `authorize.with.notifiers`, `revoke.with.notifiers`, or `approvals.with.notifiers` approval prompts. It uses the signed helper and `UNUserNotificationCenter`; it does not replace broker-triggered sudo lease notifications, which remain in place for now. ### Examples @@ -422,12 +531,20 @@ The notify task: - slack-notify: thand: notify with: - approvals: 1 - notifiers: - slack: - provider: slack - to: "C0123456789" - message: "Access granted to user" + provider: slack + to: "C0123456789" + message: "Access granted to user" +``` + +**macOS Local Notification** +```yaml +- local-notify: + thand: notify + with: + provider: local-notification + to: ${ $context.user.email } + device: "${ .device }" + message: "Your local sudo request was approved" ``` **Note**: The notify task is primarily used internally by the approvals task. For standalone notifications, consider using standard Serverless Workflow `call` tasks to external APIs. @@ -576,7 +693,13 @@ Error: authorization failed for user 'alice' role 'admin' ``` **Solution**: Verify the request has been properly approved and the user has permission to request the role. -#### 4. Monitoring Limitations +#### 4. Missing Execution Plan +``` +Error: failed to get execution plan from workflow context +``` +**Solution**: Ensure `thand: revoke` only runs on paths where `thand: authorize` has already executed and recorded the execution plan. + +#### 5. 
Monitoring Limitations ``` Error: Monitoring is only supported with temporal ``` @@ -591,4 +714,4 @@ logging: level: debug ``` -Check workflow context to understand task inputs and state. \ No newline at end of file +Check workflow context to understand task inputs and state. diff --git a/docs/development/index.md b/docs/development/index.md index a5f846b7..24736c4d 100644 --- a/docs/development/index.md +++ b/docs/development/index.md @@ -9,3 +9,12 @@ description: Developer documentation for Thand Agent # Development Documentation for developers contributing to or extending the Thand Agent. + +## Config Mutation Invariant + +Configuration definition maps should be treated as immutable snapshots. +When config changes, prefer replacing whole entries or whole definition maps +instead of mutating nested state in place. Some older code paths still perform +mutation-prone updates; keep new code aligned with the invariant and track +cleanup of legacy exceptions in follow-up issues rather than extending them. +Current cleanup work is tracked in [#306](https://github.com/thand-io/agent/issues/306). 
diff --git a/docs/getting-started.md b/docs/getting-started.md index bc4f31ee..08507977 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -165,6 +165,11 @@ Request temporary sudo access on your local machine: thand-agent request sudo --duration 30m --reason "System maintenance" ``` +For device-targeted usage and configuration details, see: + +- [Local Sudo Usage](api/agent/local-sudo.md) +- [Local Sudo Configuration](configuration/local-sudo.md) + ## Next Steps - **[Environment Setup](../environments/)** - Configure Thand for specific environments diff --git a/docs/internal/adr-device-routing-phase-1.md b/docs/internal/adr-device-routing-phase-1.md new file mode 100644 index 00000000..0036b705 --- /dev/null +++ b/docs/internal/adr-device-routing-phase-1.md @@ -0,0 +1,72 @@ +--- +layout: default +title: "ADR: Device Routing Phase 1" +parent: Internal +nav_order: 2 +--- + +# ADR: Device Routing Phase 1 + +## Status + +Accepted. + +## Context + +Device-local workflows need a way to target a specific machine. + +At the same time, the project is not yet ready to introduce full cryptographic device identity or a dedicated device control plane. 
+ +## Decision + +Phase 1 uses: + +- first-class device definitions on the server +- canonical `device_id` as the device-matching key +- `/register` as bootstrap/config sync only +- live route tracking from agent-published device-route state +- shared Temporal device-definition and device-route registries on `thand_device_registry` +- periodic route refresh from running agents +- fresh-route checks for device-targeted workflow dispatch + +It explicitly does not use: + +- provider tenants as device identifiers +- indefinite waiting for device-targeted authorize steps + +## Rationale + +This choice gives us the smallest useful device substrate that: + +- removes stale static routing +- supports reconnect-aware device execution +- keeps the design generic for future device-local workflows +- leaves room for a later secure identity redesign + +## Alternatives Considered + +### Model devices as provider tenants + +Rejected because tenants are provider-scoped account concepts, not machine execution concepts. + +### Require immediate online presence with no waiting + +Rejected because short outages and restarts are normal. A bounded wait is a better operator experience for authorize, while revoke can retry for reconciliation. + +### Solve cryptographic device identity first + +Rejected for phase 1 because it would block useful device plumbing behind a larger security redesign. 
+ +## Consequences + +Positive consequences: + +- simpler config +- more honest routing model +- reusable device-targeted execution layer + +Negative consequences: + +- the current generated `device_id` is still a client-presented identity and not yet a cryptographically enrolled device credential +- device config and routing now depend on internal shared Temporal registries that need future hardening and operational polish +- later phases will need migration work for stronger enrollment diff --git a/docs/internal/device-model.md b/docs/internal/device-model.md new file mode 100644 index 00000000..14ae3e18 --- /dev/null +++ b/docs/internal/device-model.md @@ -0,0 +1,164 @@ +--- +layout: default +title: Device Model +parent: Internal +nav_order: 1 +--- + +# Device Model + +This document describes the long-term device architecture for thand, the first phase now implemented in code, and the work intentionally deferred. + +## Why Devices Exist + +Local execution targets such as laptops, desktops, and servers have different lifecycle and security requirements than: + +- users, which represent human principals +- providers, which represent integrations such as AWS or JumpCloud +- tenants, which represent provider-scoped accounts or resource containers + +A device is therefore modeled as a first-class server-managed object. Devices let the server answer questions like: + +- which machine should receive a device-local workflow +- which device-local policy applies to that machine +- whether the machine is currently connected and routable + +Operators can print the local machine's current device ID with `thand config device-id`. +Non-production builds may override the generated value for deterministic testing, but production binaries always use the generated machine-derived `device_id`. 
+ +## What A Device Is + +In the current model, a device has: + +- a stable device ID +- human-readable metadata such as `name` and `description` +- optional per-device local-elevation policy + +Runtime connection state is tracked separately from static device policy. That runtime state currently includes: + +- `task_queue` +- `last_seen_at` +- derived freshness / connected status + +## Why Devices Are Not Tenants + +Provider tenants and devices look superficially similar because both can affect routing and authorization scope, but they solve different problems. + +Tenants are provider-scoped. A tenant says which account or org inside a provider a request applies to. Devices are execution-scoped. A device says which machine should run a workflow or local action. + +Using tenants for devices would overload provider semantics with machine lifecycle concerns such as: + +- agent registration +- live route freshness +- local reconciliation after reconnect +- future privileged-helper transport + +That coupling would make both models harder to reason about, so devices remain separate. + +## Target Architecture + +The intended architecture is: + +1. The server owns device definitions and device policy. +2. An agent represents one device, running as a system-level service rather than a per-user helper. +3. `/register` bootstraps config only; running agents publish live route state directly to Temporal. +4. Device-targeted workflows route through that live route only. +5. Device-local capabilities such as local sudo are layered on top of the device substrate. + +Today the canonical `device_id` is machine-derived. Longer term, device registration should use a stronger enrolled identity, but keep the same `device_id` abstraction boundary. + +Device-targeted workflows also rely on a separate execution-planning phase before authorization. That workflow-level contract is documented in [Execution Planning](/internal/execution-planning.html). 
+ +## Phase 1: What Is Implemented Now + +Phase 1 establishes the basic device substrate without yet solving strong device identity. + +Implemented now: + +- first-class `Device` definitions in config +- live device connection state tracked in the shared Temporal device-route registry +- shared device definitions tracked in a Temporal device-definition registry +- periodic device registration refresh from the agent +- route freshness checks using `last_seen_at` +- device-targeted provider child workflows using a fresh live route +- bounded waiting for authorize when a device is temporarily offline +- retrying revoke reconciliation when the device is offline + +Not yet implemented: + +- cryptographic device enrollment or proof-of-possession identity behind the existing `device_id` abstraction +- a dedicated control-plane service for device config +- device discovery UI or richer device selection UX +- a privileged local helper split from the main agent + +Phase 1 routes only from fresh live registration state. + +## Current Routing Model + +Today, routing works like this: + +1. An agent registers with the server. +2. The server returns bootstrap/config data to the agent. +3. The agent publishes its current `task_queue` and `last_seen_at` to the shared Temporal device-route registry on `thand_device_registry`. +4. Servers publish configured device definitions to the shared Temporal device-definition registry on `thand_device_registry`. +5. Device-targeted workflows query shared device policy during execution planning and ask for a fresh route before dispatch. +6. If the route is missing or stale, authorize waits for a bounded window and revoke keeps retrying for reconciliation. 
+ +This gives us a cleaner failure model: + +- authorize should not succeed much later than requested +- revoke should converge once the device reconnects +- timed local enforcement should not depend solely on centralized connectivity + +## Consequences of the Current Design + +The current phase-1 design has a few important consequences: + +- devices are now a generic execution substrate, not a sudo-only feature +- routing depends on liveness, not static config +- agents are treated as per-device services, not per-user services +- the machine-derived `device_id` is now the single routing identity used across registration, planning, and dispatch + +## Known Shortcomings + +The biggest gap is still device identity hardening. + +Today the server matches a connecting agent to a device through the generated `device_id`. That is enough for phase 1 plumbing and local development, but it is not strong enough for a final design because it is still based on client-presented identity. + +Other gaps: + +- no dedicated control-plane API for device configuration +- no secure enrollment story yet +- shared device registries are still internal Temporal workflows rather than a broader device control-plane service +- no independent privileged helper transport yet on Linux or Windows +- macOS now has an initial native privilege-services split, with an app-managed login item, broker daemon, and brokerctl bridge for timed sudoers grants +- no explicit multi-agent-per-device design, because the current assumption is one system agent per device + +## Future Phases + +Future work should cover at least: + +### Strong device identity + +- enrolled device credentials +- challenge / proof-of-possession registration +- authenticated binding between device record and live route + +### Dedicated device config distribution + +- server-managed per-device policy delivery +- explicit control-plane lifecycle for devices +- eventual separation between interactive user login and device bootstrap + +### 
Privileged local helper + +- macOS LaunchDaemon broker plus per-user notification helper +- OS-native trust checks between the unprivileged agent, broker, and notifier +- narrow local lease/enforcer contract with persisted expiry and restart reconciliation +- future Linux and Windows helpers that match the same broker client abstraction + +### Better UX + +- device discovery APIs +- device picker UX +- clearer offline / reconnect status in local-device workflows diff --git a/docs/internal/execution-planning.md b/docs/internal/execution-planning.md new file mode 100644 index 00000000..564b0c30 --- /dev/null +++ b/docs/internal/execution-planning.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Execution Planning +parent: Internal +nav_order: 2 +--- + +# Execution Planning + +This document describes the internal `execution_plan` contract and the rules that `authorize` and `revoke` rely on. + +## Why Execution Planning Exists + +The workflow engine needs a deterministic point where a request stops being user-facing intent and becomes execution-ready work. + +That planning step now happens inside a single Temporal activity invoked by `authorize`. The activity: + +- reads the normalized request after validation and approvals are complete +- resolves the provider, identity, and device data needed for execution +- materializes provider-native authorization requests +- stores the resulting `execution_plan` in workflow context/history + +This keeps mutable lookups out of workflow code while still letting `authorize` and `revoke` stay generic. + +Device-local request shaping is handled by internal execution-plan decorators. That keeps action-specific logic, such as local sudo device-policy enrichment, together without teaching the Temporal activity about individual request types. + +## What the Execution Plan Contains + +The execution plan is an immutable execution snapshot for the rest of the workflow. 
It contains one or more canonical authorization requests, already shaped for later provider execution. + +Each entry includes: + +- a stable `EntryID` +- the provider name +- the canonical `device_id` used for routing +- a fully materialized provider authorization request + +The plan is an internal contract. It is not a user-facing API and should not be treated as a public workflow output. + +## Execution Contract + +The execution contract is: + +1. workflow input and approvals produce the final request intent +2. `authorize` calls the execution-plan activity once +3. that activity writes `execution_plan` into workflow context/history +4. `authorize` consumes the recorded plan +5. `revoke` later consumes the same recorded plan + +`authorize` is the only task that may create the plan. `revoke` must reuse the recorded plan and fails clearly if it is missing. + +## Routing Rule + +Routing stays intentionally simple: + +- if `DeviceID == ""`, execution stays on the parent workflow queue +- if `DeviceID != ""`, execution is device-scoped and dispatch waits for a fresh route to that device + +Execution planning is responsible for deciding whether a request becomes device-scoped by setting `DeviceID` on the stored authorization request. + +`authorize` does not need to know why the request is device-scoped. It only routes based on `DeviceID`. + +## Ordering Requirements + +Execution planning is required before any access-granting provider work starts, but it is not a user-facing workflow task anymore. + +The intended ordering is: + +- `validate -> authorize` +- `validate -> approvals -> authorize` + +Put `authorize` after approvals, forms, or any other step that can still change the final request shape. That guarantees the execution-plan activity snapshots the final request, not an intermediate one. 
+ +## Failure Semantics and Constraints + +- execution planning is the snapshot point for request shaping +- if execution planning fails, the workflow fails before access is granted +- if policy changes after the plan is recorded, the in-flight workflow keeps using the recorded snapshot +- `revoke` depends on the previously recorded request shape and does not attempt to infer it later + +This separation keeps the execution model predictable and makes failure points easier to reason about. diff --git a/docs/internal/index.md b/docs/internal/index.md new file mode 100644 index 00000000..65c1a9e9 --- /dev/null +++ b/docs/internal/index.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Internal +nav_order: 7 +has_children: true +--- + +# Internal Documentation +{: .no_toc } + +Architecture notes, ADRs, and implementation guidance for maintainers. + +- [Device Model](/internal/device-model.html) +- [Execution Planning](/internal/execution-planning.html) +- [ADR: Device Routing Phase 1](/internal/adr-device-routing-phase-1.html) diff --git a/examples/workflows/README.md b/examples/workflows/README.md index e0851cf5..bd3005ca 100644 --- a/examples/workflows/README.md +++ b/examples/workflows/README.md @@ -5,3 +5,5 @@ https://github.com/serverlessworkflow/specification/blob/v1.0.0/dsl-reference.md lets the customer define complex workflows to achive auth flows. these can either be defined by the user or we can use AI to generate these workflows on the fly for the AI to execute. + +Use `provider: local-presence` inside `thand: approvals` notifiers to gate local sudo workflows with macOS device-owner authentication. 
diff --git a/internal/api/elevate.go b/internal/api/elevate.go index de377abb..33050bc3 100644 --- a/internal/api/elevate.go +++ b/internal/api/elevate.go @@ -6,7 +6,9 @@ import ( "context" "errors" "fmt" + "strings" + swfCtx "github.com/serverlessworkflow/sdk-go/v3/impl/ctx" "github.com/sirupsen/logrus" "github.com/thand-io/agent/internal/models" sdkConstants "github.com/thand-io/agent/sdk/constants" @@ -56,6 +58,10 @@ func (s *Service) Elevate(ctx context.Context, input ElevationInput) (*models.Wo request := input.Request + if err := models.NormalizeLocalSudoRequest(&request, s.cfg.GetProviderDefinitions()); err != nil { + return nil, fmt.Errorf("failed to normalize elevation request: %w", err) + } + if input.User != nil { exportableSession := &models.ExportableSession{ Session: input.User, @@ -115,6 +121,13 @@ func (s *Service) Resume(ctx context.Context, input ResumeInput) (*models.Elevat workflowTask, err := s.workflows.ResumeWorkflow(workflow) if err != nil { + if isAlreadyCompletedResumeError(err) { + logrus.WithFields(logrus.Fields{ + "workflow_id": workflow.GetWorkflowID(), + }).Debug("elevation resume: workflow already completed") + workflow.Status = swfCtx.CompletedStatus + return workflow, nil + } return nil, fmt.Errorf("failed to resume workflow: %w", err) } @@ -128,3 +141,10 @@ func (s *Service) Resume(ctx context.Context, input ResumeInput) (*models.Elevat return workflowTask, nil } + +func isAlreadyCompletedResumeError(err error) bool { + if err == nil { + return false + } + return strings.Contains(strings.ToLower(err.Error()), "workflow execution already completed") +} diff --git a/internal/api/elevate_test.go b/internal/api/elevate_test.go index adf1996f..40206c3c 100644 --- a/internal/api/elevate_test.go +++ b/internal/api/elevate_test.go @@ -7,6 +7,7 @@ import ( "time" cloudevents "github.com/cloudevents/sdk-go/v2" + swfCtx "github.com/serverlessworkflow/sdk-go/v3/impl/ctx" "github.com/stretchr/testify/assert" 
"github.com/stretchr/testify/require" "github.com/thand-io/agent/internal/config" @@ -299,3 +300,23 @@ func TestResume_RunnerError(t *testing.T) { require.Error(t, err) assert.ErrorIs(t, err, sentinel) } + +func TestResume_WorkflowAlreadyCompletedReturnsExistingWorkflow(t *testing.T) { + task := newElevateTask() + runner := &mockRunner{ + resumeFn: func(_ *models.ElevateWorkflowTask) (*models.ElevateWorkflowTask, error) { + return nil, errors.New("failed to signal workflow: workflow execution already completed") + }, + } + svc := newTestService(true, runner) + + result, err := svc.Resume(context.Background(), ResumeInput{ + Workflow: task, + User: &models.User{Email: "eve@example.com"}, + }) + + require.NoError(t, err) + require.NotNil(t, result) + assert.Same(t, task, result) + assert.Equal(t, swfCtx.CompletedStatus, result.GetStatus()) +} diff --git a/internal/common/client.go b/internal/common/client.go index 172fe699..61750d67 100644 --- a/internal/common/client.go +++ b/internal/common/client.go @@ -7,19 +7,22 @@ import ( "github.com/google/uuid" ) -// GetClientIdentifier returns a UUID that uniquely identifies this system. -// It uses the machine's hardware ID to generate a consistent, system-specific UUID. -func GetClientIdentifier() uuid.UUID { - +func getMachineDerivedDeviceID() uuid.UUID { // TODO(hugh): Check if the thand.io config exists and use that for an identifier. id, err := machineid.ID() if err != nil { - // Fallback to a random ephemeral UUID if machine ID cannot be obtained + // Fallback to a random ephemeral UUID if machine ID cannot be obtained. return uuid.New() } - // Hash the machine ID and convert to UUID format + // Hash the machine ID and convert to UUID format. hash := sha256.Sum256([]byte(id)) return uuid.UUID(hash[:16]) } + +// GetClientIdentifier returns the stable machine-derived identifier used by +// legacy call sites. Device registration and routing should prefer GetDeviceID. 
+func GetClientIdentifier() uuid.UUID { + return GetDeviceID() +} diff --git a/internal/common/device_id_default.go b/internal/common/device_id_default.go new file mode 100644 index 00000000..b6858c3b --- /dev/null +++ b/internal/common/device_id_default.go @@ -0,0 +1,10 @@ +//go:build !thand_dev + +package common + +import "github.com/google/uuid" + +// GetDeviceID returns the effective device identity for this machine. +func GetDeviceID() uuid.UUID { + return getMachineDerivedDeviceID() +} diff --git a/internal/common/device_id_dev.go b/internal/common/device_id_dev.go new file mode 100644 index 00000000..ea008b90 --- /dev/null +++ b/internal/common/device_id_dev.go @@ -0,0 +1,25 @@ +//go:build thand_dev + +package common + +import ( + "os" + "strings" + + "github.com/google/uuid" +) + +const deviceIDOverrideEnvVar = "THAND_DEV_DEVICE_ID_OVERRIDE" + +// GetDeviceID returns the effective device identity for this machine. +// Dev-tagged builds may override the machine-derived ID for deterministic tests. 
+func GetDeviceID() uuid.UUID { + override := strings.TrimSpace(os.Getenv(deviceIDOverrideEnvVar)) + if override != "" { + if parsed, err := uuid.Parse(override); err == nil { + return parsed + } + } + + return getMachineDerivedDeviceID() +} diff --git a/internal/common/device_id_dev_test.go b/internal/common/device_id_dev_test.go new file mode 100644 index 00000000..ce513620 --- /dev/null +++ b/internal/common/device_id_dev_test.go @@ -0,0 +1,22 @@ +//go:build thand_dev + +package common + +import "testing" + +func TestGetDeviceIDHonorsDevOverride(t *testing.T) { + t.Setenv(deviceIDOverrideEnvVar, "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") + + got := GetDeviceID().String() + want := "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + + if got != want { + t.Fatalf("GetDeviceID() = %q, want %q", got, want) + } +} + +func TestGetClientIdentifierMatchesDeviceID(t *testing.T) { + if got, want := GetClientIdentifier(), GetDeviceID(); got != want { + t.Fatalf("GetClientIdentifier() = %q, want %q", got, want) + } +} diff --git a/internal/common/device_id_test.go b/internal/common/device_id_test.go new file mode 100644 index 00000000..775fe08f --- /dev/null +++ b/internal/common/device_id_test.go @@ -0,0 +1,24 @@ +//go:build !thand_dev + +package common + +import ( + "testing" +) + +func TestGetDeviceIDIgnoresDevOverrideInProductionBuild(t *testing.T) { + t.Setenv("THAND_DEV_DEVICE_ID_OVERRIDE", "11111111-2222-3333-4444-555555555555") + + got := GetDeviceID() + want := getMachineDerivedDeviceID() + + if got != want { + t.Fatalf("GetDeviceID() = %q, want machine-derived %q", got, want) + } +} + +func TestGetClientIdentifierMatchesDeviceID(t *testing.T) { + if got, want := GetClientIdentifier(), GetDeviceID(); got != want { + t.Fatalf("GetClientIdentifier() = %q, want %q", got, want) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index e7a63edc..e994d922 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -360,6 +360,7 @@ func (c *Config) 
ReloadConfig() error { logrus.Infoln("Loaded workflows from external source:", len(workflows)) c.mu.Lock() c.Workflows.Definitions = workflows + c.configGeneration++ c.mu.Unlock() } else { logrus.Warningln("No workflows loaded from external source") @@ -378,6 +379,7 @@ func (c *Config) ReloadConfig() error { logrus.Infoln("Loaded providers from external source:", len(providers)) c.mu.Lock() c.Providers.Definitions = providers + c.configGeneration++ c.mu.Unlock() } else { logrus.Warningln("No providers loaded from external source") @@ -394,6 +396,7 @@ func (c *Config) ReloadConfig() error { logrus.Infoln("Loaded roles from external source:", len(roles)) c.mu.Lock() c.Roles.Definitions = roles + c.configGeneration++ c.mu.Unlock() } else { logrus.Warningln("No roles loaded from external source") @@ -600,7 +603,9 @@ func (c *Config) RegisterWithThandServer() error { }, } - registration, err := c.syncWithEndpoint(thandLoginUrl, authentication) + registration, err := c.syncWithEndpoint(thandLoginUrl, authentication, loginServerRegistrationOptions{ + applyServices: true, + }) if err != nil { return fmt.Errorf("failed to register with thand server: %w", err) @@ -620,17 +625,39 @@ func (c *Config) RegisterWithThandServer() error { } +type loginServerRegistrationOptions struct { + applyServices bool +} + func (c *Config) RegisterWithLoginServer(auth *model.ReferenceableAuthenticationPolicy) (*RegistrationResponse, error) { loginUrl := c.DiscoverLoginServerApiUrl( c.GetLoginServerUrl(), ) - return c.syncWithEndpoint(loginUrl, auth) + return c.syncWithEndpoint(loginUrl, auth, loginServerRegistrationOptions{ + applyServices: true, + }) + +} + +func (c *Config) RefreshLoginServerRegistration(auth *model.ReferenceableAuthenticationPolicy) (*RegistrationResponse, error) { + + loginUrl := c.DiscoverLoginServerApiUrl( + c.GetLoginServerUrl(), + ) + + return c.syncWithEndpoint(loginUrl, auth, loginServerRegistrationOptions{ + applyServices: false, + }) } -func (c *Config) 
syncWithEndpoint(loginUrl string, authentication *model.ReferenceableAuthenticationPolicy) (*RegistrationResponse, error) { +func (c *Config) syncWithEndpoint( + loginUrl string, + authentication *model.ReferenceableAuthenticationPolicy, + options loginServerRegistrationOptions, +) (*RegistrationResponse, error) { version, commit, _ := common.GetModuleBuildInfo() @@ -638,7 +665,7 @@ func (c *Config) syncWithEndpoint(loginUrl string, authentication *model.Referen Mode: c.GetMode(), Version: version, Commit: commit, - Identifier: common.GetClientIdentifier(), + Identifier: common.GetDeviceID(), Endpoint: c.GetLoginServerUrl(), Origin: c.GetLocalServerUrl(), }) @@ -675,7 +702,7 @@ func (c *Config) syncWithEndpoint(loginUrl string, authentication *model.Referen Environment: &c.Environment, Version: version, Commit: commit, - Identifier: common.GetClientIdentifier(), + Identifier: common.GetDeviceID(), Endpoint: c.GetLoginServerUrl(), Origin: c.GetLocalServerUrl(), }) @@ -754,7 +781,7 @@ func (c *Config) syncWithEndpoint(loginUrl string, authentication *model.Referen } } - if registrationResponse.Services != nil { + if options.applyServices && registrationResponse.Services != nil { // Setup temporal services if provided if registrationResponse.Services.Temporal != nil { diff --git a/internal/config/device_bootstrap.go b/internal/config/device_bootstrap.go new file mode 100644 index 00000000..7056ab75 --- /dev/null +++ b/internal/config/device_bootstrap.go @@ -0,0 +1,207 @@ +package config + +import ( + "context" + "fmt" + "time" + + "github.com/sirupsen/logrus" + "github.com/thand-io/agent/internal/common" + "github.com/thand-io/agent/internal/models" +) + +const ( + deviceBootstrapInitialBackoff = 2 * time.Second + deviceBootstrapMaxBackoff = 1 * time.Minute +) + +func (c *Config) BootstrapDeviceWithLoginServer() error { + if !c.IsAgent() { + return fmt.Errorf("device bootstrap is only valid in agent mode") + } + if !c.HasLoginServer() { + return fmt.Errorf("no login 
server endpoint configured") + } + + registration, err := c.RegisterWithLoginServer(nil) + if err != nil { + return err + } + + if err := c.applyRegistrationConfiguration(registration); err != nil { + return err + } + + environment := c.GetEnvironmentConfig() + logrus.WithFields(logrus.Fields{ + "device_id": common.GetDeviceID().String(), + "name": environment.Name, + "hostname": environment.Hostname, + "platform": environment.Platform, + "has_config": registration != nil, + }).Info("Bootstrapped agent configuration from login server") + + if err := c.EnsureProviderTemporalBindings(); err != nil { + return fmt.Errorf("ensuring provider temporal bindings: %w", err) + } + + if err := c.PublishCurrentAgentRoute(context.Background()); err != nil { + return fmt.Errorf("publishing current device route: %w", err) + } + + return nil +} + +func (c *Config) RefreshDeviceRegistrationWithLoginServer() error { + if !c.IsAgent() { + return fmt.Errorf("device refresh is only valid in agent mode") + } + if !c.HasLoginServer() { + return fmt.Errorf("no login server endpoint configured") + } + + registration, err := c.RefreshLoginServerRegistration(nil) + if err != nil { + return err + } + + if err := c.applyRegistrationConfiguration(registration); err != nil { + return err + } + + environment := c.GetEnvironmentConfig() + logrus.WithFields(logrus.Fields{ + "device_id": common.GetDeviceID().String(), + "name": environment.Name, + "hostname": environment.Hostname, + "platform": environment.Platform, + }).Debug("Refreshed device registration with login server") + + if err := c.PublishCurrentAgentRoute(context.Background()); err != nil { + return fmt.Errorf("publishing current device route: %w", err) + } + + return nil +} + +func (c *Config) RunDeviceBootstrap(ctx context.Context) { + backoff := deviceBootstrapInitialBackoff + bootstrapped := false + + for { + var err error + if bootstrapped { + err = c.RefreshDeviceRegistrationWithLoginServer() + } else { + err = 
c.BootstrapDeviceWithLoginServer() + } + if err == nil { + bootstrapped = true + backoff = deviceBootstrapInitialBackoff + + timer := time.NewTimer(deviceRouteRefreshInterval) + select { + case <-ctx.Done(): + timer.Stop() + return + case <-timer.C: + } + continue + } + + logrus.WithError(err).WithField("retry_in", backoff).Warn("device bootstrap failed; retrying") + + timer := time.NewTimer(backoff) + select { + case <-ctx.Done(): + timer.Stop() + return + case <-timer.C: + } + + backoff *= 2 + if backoff > deviceBootstrapMaxBackoff { + backoff = deviceBootstrapMaxBackoff + } + } +} + +func (c *Config) applyRegistrationConfiguration(registration *RegistrationResponse) error { + if registration == nil { + return nil + } + + if registration.Roles == nil && + registration.Workflows == nil && + registration.Providers == nil { + return nil + } + + beforeGeneration := c.getConfigGeneration() + if err := c.MergeConfiguration(registration); err != nil { + return fmt.Errorf("merging registration configuration: %w", err) + } + + if c.getConfigGeneration() == beforeGeneration { + return nil + } + + if err := c.InitializeProviders(); err != nil { + return fmt.Errorf("initializing providers from registration configuration: %w", err) + } + + if !c.IsClient() { + go func() { + if err := c.ReloadRoleIndexes(); err != nil { + logrus.WithError(err).Errorln("Failed to reload role indexes after registration configuration update") + } + }() + } + + return nil +} + +func (c *Config) getConfigGeneration() uint64 { + c.mu.RLock() + defer c.mu.RUnlock() + return c.configGeneration +} + +func (c *Config) PublishCurrentAgentRoute(ctx context.Context) error { + services := c.GetServices() + if services == nil || !services.HasTemporal() { + logrus.Debug("Skipping current device route publication because Temporal is unavailable") + return nil + } + + temporalService := services.GetTemporal() + if temporalService == nil || !temporalService.HasClient() { + logrus.Debug("Skipping current device 
route publication because the Temporal client is unavailable") + return nil + } + + return c.publishCurrentAgentRoute(ctx, c.PublishDeviceConnectionState) +} + +func (c *Config) publishCurrentAgentRoute( + ctx context.Context, + publish func(context.Context, models.DeviceConnectionState) error, +) error { + if !c.IsAgent() { + return fmt.Errorf("current device route publication is only valid in agent mode") + } + if publish == nil { + return fmt.Errorf("device route publisher is required") + } + + environment := c.GetEnvironmentConfig() + state := models.DeviceConnectionState{ + DeviceID: common.GetDeviceID().String(), + TaskQueue: environment.GetIdentifier(), + Name: c.GetEnvironment().Name, + Hostname: c.GetEnvironment().Hostname, + Platform: string(c.GetEnvironment().Platform), + } + + return publish(ctx, state) +} diff --git a/internal/config/device_bootstrap_test.go b/internal/config/device_bootstrap_test.go new file mode 100644 index 00000000..8c1b332e --- /dev/null +++ b/internal/config/device_bootstrap_test.go @@ -0,0 +1,138 @@ +package config + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/serverlessworkflow/sdk-go/v3/model" + "github.com/thand-io/agent/internal/common" + "github.com/thand-io/agent/internal/models" +) + +func TestBootstrapDeviceWithLoginServerMergesRemoteProviderDefinitions(t *testing.T) { + t.Parallel() + + registerResponse := RegistrationResponse{ + Success: true, + Providers: &ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + "oauth2-directory": { + Name: "Directory Login", + Description: "Remote OAuth2 provider", + Provider: "oauth2", + Enabled: true, + Config: &models.BasicConfig{ + "client_id": "test-client-id", + "client_secret": "test-client-secret", + "auth_url": "https://auth.example.com/oauth2/auth", + "token_url": "https://auth.example.com/oauth2/token", + "redirect_url": "http://localhost/callback", + }, + }, + }, + }, + } + + loginServer := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/api/v1/preflight": + var req PreflightRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decoding preflight request: %v", err) + } + if got, want := req.Identifier.String(), common.GetDeviceID().String(); got != want { + t.Fatalf("preflight identifier = %q, want %q", got, want) + } + w.WriteHeader(http.StatusOK) + case "/api/v1/register": + var req RegistrationRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decoding registration request: %v", err) + } + if got, want := req.Identifier.String(), common.GetDeviceID().String(); got != want { + t.Fatalf("registration identifier = %q, want %q", got, want) + } + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(registerResponse); err != nil { + t.Fatalf("encoding registration response: %v", err) + } + case "/api/v1/sync": + w.WriteHeader(http.StatusOK) + default: + http.NotFound(w, r) + } + })) + defer loginServer.Close() + + cfg := &Config{ + mode: ModeAgent, + Login: models.LoginConfig{ + Endpoint: &model.Endpoint{ + EndpointConfig: &model.EndpointConfiguration{ + URI: &model.LiteralUri{Value: loginServer.URL}, + }, + }, + }, + Thand: models.ThandConfig{ + Endpoint: loginServer.URL, + }, + Providers: ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + "local-elevation": { + Name: "Local Elevation", + Description: "Local privilege elevation provider", + Provider: "local", + Enabled: true, + }, + }, + }, + } + + if err := cfg.InitializeProviders(); err != nil { + t.Fatalf("InitializeProviders() error = %v", err) + } + + if cfg.HasProvider("oauth2-directory") { + t.Fatal("oauth2-directory provider already present before bootstrap") + } + + if err := cfg.BootstrapDeviceWithLoginServer(); err != nil { + t.Fatalf("BootstrapDeviceWithLoginServer() error = %v", err) + } + + if 
!cfg.HasProvider("oauth2-directory") { + t.Fatal("oauth2-directory provider missing after bootstrap") + } +} + +func TestPublishCurrentAgentRouteUsesCanonicalDeviceIdentity(t *testing.T) { + t.Parallel() + + cfg := &Config{ + mode: ModeAgent, + Environment: models.EnvironmentConfig{ + Name: "workstation-alpha", + Hostname: "workstation-alpha.example.test", + Platform: models.Local, + }, + } + + var published models.DeviceConnectionState + err := cfg.publishCurrentAgentRoute(context.Background(), func(ctx context.Context, state models.DeviceConnectionState) error { + published = state + return nil + }) + if err != nil { + t.Fatalf("publishCurrentAgentRoute() error = %v", err) + } + + if got, want := published.DeviceID, common.GetDeviceID().String(); got != want { + t.Fatalf("DeviceID = %q, want %q", got, want) + } + if got, want := published.TaskQueue, "thand_local_workstation_alpha"; got != want { + t.Fatalf("TaskQueue = %q, want %q", got, want) + } +} diff --git a/internal/config/device_definition_registry.go b/internal/config/device_definition_registry.go new file mode 100644 index 00000000..f2e0532b --- /dev/null +++ b/internal/config/device_definition_registry.go @@ -0,0 +1,62 @@ +package config + +import ( + "fmt" + "strings" + + "github.com/thand-io/agent/internal/models" + "go.temporal.io/sdk/workflow" +) + +func deviceDefinitionRegistryWorkflow(ctx workflow.Context) error { + definitions := map[string]models.Device{} + + if err := workflow.SetQueryHandler(ctx, models.TemporalGetDeviceDefinitionQueryName, func(deviceID string) (*models.Device, error) { + deviceID = strings.TrimSpace(deviceID) + if deviceID == "" { + return nil, fmt.Errorf("device id is required") + } + + device, ok := definitions[deviceID] + if !ok { + return nil, fmt.Errorf("device %q is not configured", deviceID) + } + + deviceCopy := device + return &deviceCopy, nil + }); err != nil { + return err + } + + signalCh := workflow.GetSignalChannel(ctx, 
models.TemporalDeviceDefinitionUpsertSignalName) + for { + cancelled := false + selector := workflow.NewSelector(ctx) + selector.AddReceive(signalCh, func(c workflow.ReceiveChannel, _ bool) { + var device models.Device + c.Receive(ctx, &device) + + device = normalizeDeviceDefinition(device) + if device.ID == "" { + return + } + + existing, exists := definitions[device.ID] + if exists && !deviceDefinitionsEqual(existing, device) { + workflow.GetLogger(ctx).Warn("Ignoring conflicting device definition update", + "device_id", device.ID, + ) + return + } + + definitions[device.ID] = device + }) + selector.AddReceive(ctx.Done(), func(workflow.ReceiveChannel, bool) { + cancelled = true + }) + selector.Select(ctx) + if cancelled { + return ctx.Err() + } + } +} diff --git a/internal/config/device_definition_registry_test.go b/internal/config/device_definition_registry_test.go new file mode 100644 index 00000000..cd85a998 --- /dev/null +++ b/internal/config/device_definition_registry_test.go @@ -0,0 +1,91 @@ +package config + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thand-io/agent/internal/models" + "go.temporal.io/sdk/testsuite" +) + +func queryDeviceDefinitionEventually( + t *testing.T, + env *testsuite.TestWorkflowEnvironment, + deviceID string, + assertDevice func(models.Device), +) { + t.Helper() + + var poll func() + poll = func() { + value, err := env.QueryWorkflow(models.TemporalGetDeviceDefinitionQueryName, deviceID) + if err != nil && strings.Contains(err.Error(), "unknown queryType") { + env.RegisterDelayedCallback(poll, time.Millisecond) + return + } + require.NoError(t, err) + + var device models.Device + require.NoError(t, value.Get(&device)) + assertDevice(device) + + env.CancelWorkflow() + } + + env.RegisterDelayedCallback(poll, time.Millisecond) +} + +func TestDeviceDefinitionRegistryWorkflowReturnsConfiguredDevice(t *testing.T) { + t.Parallel() + + var suite 
testsuite.WorkflowTestSuite + env := suite.NewTestWorkflowEnvironment() + + env.RegisterDelayedCallback(func() { + env.SignalWorkflow(models.TemporalDeviceDefinitionUpsertSignalName, models.Device{ + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }) + }, 0) + + queryDeviceDefinitionEventually(t, env, "device-alpha", func(device models.Device) { + assert.Equal(t, "device-alpha", device.ID) + assert.Equal(t, "Device Alpha", device.Name) + }) + + env.ExecuteWorkflow(deviceDefinitionRegistryWorkflow) + require.True(t, env.IsWorkflowCompleted()) + require.Error(t, env.GetWorkflowError()) +} + +func TestDeviceDefinitionRegistryWorkflowRejectsConflictingUpdates(t *testing.T) { + t.Parallel() + + var suite testsuite.WorkflowTestSuite + env := suite.NewTestWorkflowEnvironment() + + env.RegisterDelayedCallback(func() { + env.SignalWorkflow(models.TemporalDeviceDefinitionUpsertSignalName, models.Device{ + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }) + env.SignalWorkflow(models.TemporalDeviceDefinitionUpsertSignalName, models.Device{ + ID: "device-alpha", + Name: "Conflicting Device Alpha", + Enabled: true, + }) + }, 0) + + queryDeviceDefinitionEventually(t, env, "device-alpha", func(device models.Device) { + assert.Equal(t, "Device Alpha", device.Name) + }) + + env.ExecuteWorkflow(deviceDefinitionRegistryWorkflow) + require.True(t, env.IsWorkflowCompleted()) + require.Error(t, env.GetWorkflowError()) +} diff --git a/internal/config/device_local_elevation_test.go b/internal/config/device_local_elevation_test.go new file mode 100644 index 00000000..fc05196c --- /dev/null +++ b/internal/config/device_local_elevation_test.go @@ -0,0 +1,57 @@ +package config + +import ( + "os" + "path/filepath" + "testing" +) + +func TestLoadConfigDecodesDeviceLocalElevationFields(t *testing.T) { + tempDir := t.TempDir() + configPath := filepath.Join(tempDir, "config.yaml") + configBody := []byte(` +version: "1.0" +environment: + name: test + platform: local 
+devices: + device-alpha: + device_id: "device-alpha" + name: "Example Workstation" + enabled: true + local_elevation: + enabled: true + allowed_modes: + - timed + - command + accounts: + - email: user@example.com + local_username: exampleuser +`) + if err := os.WriteFile(configPath, configBody, 0600); err != nil { + t.Fatalf("failed to write config file: %v", err) + } + + cfg, err := Load(configPath) + if err != nil { + t.Fatalf("Load returned error: %v", err) + } + + device, err := cfg.GetDevice("device-alpha") + if err != nil { + t.Fatalf("GetDevice returned error: %v", err) + } + + if got, want := device.ID, "device-alpha"; got != want { + t.Fatalf("device id = %q, want %q", got, want) + } + if device.LocalElevation == nil { + t.Fatal("expected local_elevation to be decoded") + } + if got, want := len(device.LocalElevation.Accounts), 1; got != want { + t.Fatalf("accounts len = %d, want %d", got, want) + } + if got, want := device.LocalElevation.Accounts[0].LocalUsername, "exampleuser"; got != want { + t.Fatalf("local username = %q, want %q", got, want) + } +} diff --git a/internal/config/device_registry.go b/internal/config/device_registry.go new file mode 100644 index 00000000..1d707dc3 --- /dev/null +++ b/internal/config/device_registry.go @@ -0,0 +1,341 @@ +package config + +import ( + "context" + "errors" + "fmt" + "slices" + "strings" + "time" + + "github.com/sirupsen/logrus" + "github.com/thand-io/agent/internal/models" + "go.temporal.io/api/enums/v1" + "go.temporal.io/api/serviceerror" + workflowservice "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/sdk/client" +) + +const deviceRegistryQueryTimeout = 30 * time.Second + +type deviceRegistryTemporalClient interface { + DescribeWorkflowExecution(ctx context.Context, workflowID, runID string) (*workflowservice.DescribeWorkflowExecutionResponse, error) + QueryWorkflowWithOptions(ctx context.Context, request *client.QueryWorkflowWithOptionsRequest) (*client.QueryWorkflowWithOptionsResponse, error) 
+ SignalWithStartWorkflow( + ctx context.Context, + workflowID string, + signalName string, + signalArg interface{}, + options client.StartWorkflowOptions, + workflow interface{}, + args ...interface{}, + ) (client.WorkflowRun, error) + TerminateWorkflow(ctx context.Context, workflowID, runID, reason string, details ...interface{}) error +} + +func deviceRegistryStartWorkflowOptions(workflowID string) client.StartWorkflowOptions { + // These internal singleton workflows always run on the shared + // device-registry queue, which is intentionally unversioned even when the + // operational server/agent queues use worker deployments. + return client.StartWorkflowOptions{ + ID: workflowID, + TaskQueue: models.TemporalDeviceRegistryTaskQueue, + } +} + +func registryWorkflowUsesVersioning(description *workflowservice.DescribeWorkflowExecutionResponse) bool { + if description == nil { + return false + } + + info := description.GetWorkflowExecutionInfo() + if info == nil { + return false + } + + if strings.TrimSpace(info.GetAssignedBuildId()) != "" || strings.TrimSpace(info.GetInheritedBuildId()) != "" { + return true + } + + versioningInfo := info.GetVersioningInfo() + if versioningInfo == nil { + return false + } + + if versioningInfo.GetBehavior() != enums.VERSIONING_BEHAVIOR_UNSPECIFIED { + return true + } + + return versioningInfo.GetVersioningOverride() != nil +} + +func normalizeDeviceDefinition(device models.Device) models.Device { + device.ID = strings.TrimSpace(device.ID) + device.Name = strings.TrimSpace(device.Name) + device.Description = strings.TrimSpace(device.Description) + device.Platform = strings.TrimSpace(device.Platform) + if device.LocalElevation != nil { + policy := *device.LocalElevation + policy.AllowedModes = slices.Clone(policy.AllowedModes) + policy.DeniedUsernames = slices.Clone(policy.DeniedUsernames) + policy.AllowedUIDRanges = slices.Clone(policy.AllowedUIDRanges) + if len(policy.Accounts) > 0 { + policy.Accounts = 
append([]models.DeviceLocalElevationAccount(nil), policy.Accounts...) + for i := range policy.Accounts { + policy.Accounts[i].Identity = strings.TrimSpace(policy.Accounts[i].Identity) + policy.Accounts[i].Email = strings.TrimSpace(policy.Accounts[i].Email) + policy.Accounts[i].Username = strings.TrimSpace(policy.Accounts[i].Username) + policy.Accounts[i].LocalUsername = strings.TrimSpace(policy.Accounts[i].LocalUsername) + } + } + for i := range policy.AllowedModes { + policy.AllowedModes[i] = strings.TrimSpace(policy.AllowedModes[i]) + } + for i := range policy.DeniedUsernames { + policy.DeniedUsernames[i] = strings.TrimSpace(policy.DeniedUsernames[i]) + } + for i := range policy.AllowedUIDRanges { + policy.AllowedUIDRanges[i] = strings.TrimSpace(policy.AllowedUIDRanges[i]) + } + device.LocalElevation = &policy + } + return device +} + +func deviceDefinitionsEqual(left, right models.Device) bool { + l := normalizeDeviceDefinition(left) + r := normalizeDeviceDefinition(right) + + if l.ID != r.ID || + l.Name != r.Name || + l.Description != r.Description || + l.Platform != r.Platform || + l.Enabled != r.Enabled { + return false + } + + if (l.LocalElevation == nil) != (r.LocalElevation == nil) { + return false + } + if l.LocalElevation == nil { + return true + } + + ll := l.LocalElevation + rl := r.LocalElevation + if ll.Enabled != rl.Enabled { + return false + } + + if !slices.Equal(ll.AllowedModes, rl.AllowedModes) || + !slices.Equal(ll.DeniedUsernames, rl.DeniedUsernames) || + !slices.Equal(ll.AllowedUIDRanges, rl.AllowedUIDRanges) { + return false + } + + if len(ll.Accounts) != len(rl.Accounts) { + return false + } + for i := range ll.Accounts { + la := ll.Accounts[i] + ra := rl.Accounts[i] + if la.Identity != ra.Identity || + la.Email != ra.Email || + la.Username != ra.Username || + la.LocalUsername != ra.LocalUsername { + return false + } + } + + return true +} + +func queryDeviceDefinition( + ctx context.Context, + temporalClient deviceRegistryTemporalClient, + 
deviceID string, +) (*models.Device, error) { + deviceID = strings.TrimSpace(deviceID) + if deviceID == "" { + return nil, fmt.Errorf("device id is required") + } + if temporalClient == nil { + return nil, fmt.Errorf("shared device registry is unavailable") + } + + timeoutCtx, cancel := context.WithTimeout(ctx, deviceRegistryQueryTimeout) + defer cancel() + + queryResponse, err := temporalClient.QueryWorkflowWithOptions(timeoutCtx, &client.QueryWorkflowWithOptionsRequest{ + WorkflowID: models.TemporalDeviceDefinitionRegistryWorkflowID, + RunID: "", + QueryType: models.TemporalGetDeviceDefinitionQueryName, + QueryRejectCondition: enums.QUERY_REJECT_CONDITION_NOT_OPEN, + Args: []any{deviceID}, + }) + if err != nil { + return nil, fmt.Errorf("device %q is not configured", deviceID) + } + if queryResponse == nil || queryResponse.QueryResult == nil { + return nil, fmt.Errorf("device %q is not configured", deviceID) + } + + var device models.Device + if err := queryResponse.QueryResult.Get(&device); err != nil { + return nil, err + } + + normalized := normalizeDeviceDefinition(device) + return &normalized, nil +} + +func ensureRegistryWorkflowTaskQueue( + ctx context.Context, + temporalClient deviceRegistryTemporalClient, + workflowID string, +) error { + if temporalClient == nil { + return fmt.Errorf("temporal client is required to manage registry workflow %q", workflowID) + } + + description, err := temporalClient.DescribeWorkflowExecution(ctx, workflowID, "") + if err != nil { + var notFound *serviceerror.NotFound + if errors.As(err, &notFound) { + return nil + } + return err + } + + taskQueue := "" + if description != nil && description.ExecutionConfig != nil && description.ExecutionConfig.TaskQueue != nil { + taskQueue = strings.TrimSpace(description.ExecutionConfig.TaskQueue.Name) + } + if taskQueue == "" || taskQueue == models.TemporalDeviceRegistryTaskQueue { + if !registryWorkflowUsesVersioning(description) { + return nil + } + + logrus.WithFields(logrus.Fields{ + 
"workflow_id": workflowID, + "task_queue": models.TemporalDeviceRegistryTaskQueue, + }).Warn("Recreating versioned device registry workflow on the canonical unversioned device registry queue") + + return temporalClient.TerminateWorkflow(ctx, workflowID, "", "migrating device registry workflow to canonical unversioned queue") + } + + logrus.WithFields(logrus.Fields{ + "workflow_id": workflowID, + "task_queue": taskQueue, + "expected": models.TemporalDeviceRegistryTaskQueue, + }).Warn("Recreating device registry workflow on the canonical device registry queue") + + return temporalClient.TerminateWorkflow(ctx, workflowID, "", "migrating device registry workflow to canonical task queue") +} + +func publishDeviceDefinition( + ctx context.Context, + temporalClient deviceRegistryTemporalClient, + device models.Device, +) error { + if temporalClient == nil { + return fmt.Errorf("shared device registry is unavailable") + } + + device = normalizeDeviceDefinition(device) + if device.ID == "" { + return nil + } + + _, err := temporalClient.SignalWithStartWorkflow( + ctx, + models.TemporalDeviceDefinitionRegistryWorkflowID, + models.TemporalDeviceDefinitionUpsertSignalName, + device, + deviceRegistryStartWorkflowOptions(models.TemporalDeviceDefinitionRegistryWorkflowID), + models.TemporalDeviceDefinitionRegistryWorkflowName, + ) + return err +} + +func (c *Config) querySharedDeviceDefinition(ctx context.Context, deviceID string) (*models.Device, error) { + services := c.GetServices() + if services == nil || !services.HasTemporal() { + return nil, fmt.Errorf("shared device registry is unavailable") + } + + temporalService := services.GetTemporal() + if temporalService == nil || !temporalService.HasClient() { + return nil, fmt.Errorf("shared device registry is unavailable") + } + + return queryDeviceDefinition(ctx, temporalService.GetClient(), deviceID) +} + +func (c *Config) EnsureDeviceRegistryWorkflows(ctx context.Context) error { + if !c.IsServer() { + return nil + } + + 
services := c.GetServices() + if services == nil || !services.HasTemporal() { + return fmt.Errorf("temporal service is required to manage device registries") + } + + temporalService := services.GetTemporal() + if temporalService == nil || !temporalService.HasClient() { + return fmt.Errorf("temporal client is required to manage device registries") + } + + for _, workflowID := range []string{ + models.TemporalDeviceRouteRegistryWorkflowID, + models.TemporalDeviceDefinitionRegistryWorkflowID, + } { + if err := ensureRegistryWorkflowTaskQueue(ctx, temporalService.GetClient(), workflowID); err != nil { + return err + } + } + + return nil +} + +func (c *Config) PublishConfiguredDeviceDefinitions(ctx context.Context) error { + if !c.IsServer() { + return nil + } + + services := c.GetServices() + if services == nil || !services.HasTemporal() { + return fmt.Errorf("temporal service is required to publish device definitions") + } + + temporalService := services.GetTemporal() + if temporalService == nil || !temporalService.HasClient() { + return fmt.Errorf("temporal client is required to publish device definitions") + } + + deviceIDs := make([]string, 0, len(c.Devices.Definitions)) + for _, device := range c.Devices.Definitions { + deviceID := strings.TrimSpace(device.ID) + if deviceID == "" { + continue + } + deviceIDs = append(deviceIDs, deviceID) + } + slices.Sort(deviceIDs) + + for _, deviceID := range deviceIDs { + device, err := c.GetDevice(deviceID) + if err != nil { + return err + } + if device == nil { + continue + } + if err := publishDeviceDefinition(ctx, temporalService.GetClient(), *device); err != nil { + return err + } + } + + return nil +} diff --git a/internal/config/device_registry_test.go b/internal/config/device_registry_test.go new file mode 100644 index 00000000..d1b7bf13 --- /dev/null +++ b/internal/config/device_registry_test.go @@ -0,0 +1,139 @@ +package config + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + 
"github.com/stretchr/testify/require" + "github.com/thand-io/agent/internal/models" + enumspb "go.temporal.io/api/enums/v1" + taskqueuepb "go.temporal.io/api/taskqueue/v1" + workflowpb "go.temporal.io/api/workflow/v1" + workflowservice "go.temporal.io/api/workflowservice/v1" + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/converter" +) + +type fakeDeviceRegistryClient struct { + describeResponse *workflowservice.DescribeWorkflowExecutionResponse + describeErr error + queryResponse *client.QueryWorkflowWithOptionsResponse + queryErr error + terminated []string + signalOptions []client.StartWorkflowOptions + signalNames []string + signalArgs []any +} + +func (f *fakeDeviceRegistryClient) DescribeWorkflowExecution(ctx context.Context, workflowID, runID string) (*workflowservice.DescribeWorkflowExecutionResponse, error) { + return f.describeResponse, f.describeErr +} + +func (f *fakeDeviceRegistryClient) QueryWorkflowWithOptions(ctx context.Context, request *client.QueryWorkflowWithOptionsRequest) (*client.QueryWorkflowWithOptionsResponse, error) { + return f.queryResponse, f.queryErr +} + +func (f *fakeDeviceRegistryClient) SignalWithStartWorkflow( + ctx context.Context, + workflowID string, + signalName string, + signalArg interface{}, + options client.StartWorkflowOptions, + workflow interface{}, + args ...interface{}, +) (client.WorkflowRun, error) { + f.signalOptions = append(f.signalOptions, options) + f.signalNames = append(f.signalNames, signalName) + f.signalArgs = append(f.signalArgs, signalArg) + return nil, nil +} + +func (f *fakeDeviceRegistryClient) TerminateWorkflow(ctx context.Context, workflowID, runID, reason string, details ...interface{}) error { + f.terminated = append(f.terminated, workflowID) + return nil +} + +func TestEnsureRegistryWorkflowTaskQueueTerminatesWrongQueue(t *testing.T) { + t.Parallel() + + client := &fakeDeviceRegistryClient{ + describeResponse: &workflowservice.DescribeWorkflowExecutionResponse{ + ExecutionConfig: 
&workflowpb.WorkflowExecutionConfig{ + TaskQueue: &taskqueuepb.TaskQueue{Name: "thand_local_old_server"}, + }, + }, + } + + err := ensureRegistryWorkflowTaskQueue(context.Background(), client, models.TemporalDeviceRouteRegistryWorkflowID) + require.NoError(t, err) + assert.Equal(t, []string{models.TemporalDeviceRouteRegistryWorkflowID}, client.terminated) +} + +func TestEnsureRegistryWorkflowTaskQueueTerminatesVersionedRegistryWorkflow(t *testing.T) { + t.Parallel() + + client := &fakeDeviceRegistryClient{ + describeResponse: &workflowservice.DescribeWorkflowExecutionResponse{ + ExecutionConfig: &workflowpb.WorkflowExecutionConfig{ + TaskQueue: &taskqueuepb.TaskQueue{Name: models.TemporalDeviceRegistryTaskQueue}, + }, + WorkflowExecutionInfo: &workflowpb.WorkflowExecutionInfo{ + VersioningInfo: &workflowpb.WorkflowExecutionVersioningInfo{ + Behavior: enumspb.VERSIONING_BEHAVIOR_AUTO_UPGRADE, + }, + }, + }, + } + + err := ensureRegistryWorkflowTaskQueue(context.Background(), client, models.TemporalDeviceDefinitionRegistryWorkflowID) + require.NoError(t, err) + assert.Equal(t, []string{models.TemporalDeviceDefinitionRegistryWorkflowID}, client.terminated) +} + +func TestPublishDeviceDefinitionUsesCanonicalRegistryQueue(t *testing.T) { + t.Parallel() + + fakeClient := &fakeDeviceRegistryClient{} + err := publishDeviceDefinition(context.Background(), fakeClient, models.Device{ + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }) + require.NoError(t, err) + require.Len(t, fakeClient.signalOptions, 1) + assert.Equal(t, models.TemporalDeviceRegistryTaskQueue, fakeClient.signalOptions[0].TaskQueue) + assert.Equal(t, models.TemporalDeviceDefinitionUpsertSignalName, fakeClient.signalNames[0]) + assert.Nil(t, fakeClient.signalOptions[0].VersioningOverride) +} + +func TestDeviceRegistryStartWorkflowOptionsOmitsVersioningOverride(t *testing.T) { + t.Parallel() + + opts := deviceRegistryStartWorkflowOptions(models.TemporalDeviceDefinitionRegistryWorkflowID) + 
assert.Equal(t, models.TemporalDeviceRegistryTaskQueue, opts.TaskQueue) + assert.Nil(t, opts.VersioningOverride) +} + +func TestQueryDeviceDefinitionReturnsStoredDevice(t *testing.T) { + t.Parallel() + + payloads, err := converter.GetDefaultDataConverter().ToPayloads(models.Device{ + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }) + require.NoError(t, err) + + client := &fakeDeviceRegistryClient{ + queryResponse: &client.QueryWorkflowWithOptionsResponse{ + QueryResult: client.NewValue(payloads), + }, + } + + device, err := queryDeviceDefinition(context.Background(), client, "device-alpha") + require.NoError(t, err) + require.NotNil(t, device) + assert.Equal(t, "device-alpha", device.ID) + assert.Equal(t, "Device Alpha", device.Name) +} diff --git a/internal/config/device_route_registry.go b/internal/config/device_route_registry.go new file mode 100644 index 00000000..1e9afb52 --- /dev/null +++ b/internal/config/device_route_registry.go @@ -0,0 +1,101 @@ +package config + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/thand-io/agent/internal/models" + "go.temporal.io/sdk/workflow" +) + +const deviceRouteRegistryTickInterval = 30 * time.Second + +func deviceRouteRegistryWorkflow(ctx workflow.Context) error { + routes := map[string]models.DeviceConnectionState{} + + if err := workflow.SetQueryHandler(ctx, models.TemporalGetDeviceRouteQueryName, func(deviceID string) (*models.DeviceConnectionState, error) { + deviceID = strings.TrimSpace(deviceID) + if deviceID == "" { + return nil, fmt.Errorf("device id is required") + } + + route, ok := routes[deviceID] + if !ok { + return nil, fmt.Errorf("%w: device %q is not connected", ErrDeviceRouteUnavailable, deviceID) + } + + route.Connected = route.TaskQueue != "" && !route.LastSeenAt.IsZero() && workflow.Now(ctx).Sub(route.LastSeenAt) <= models.DeviceRouteFreshnessTTL + if !route.Connected { + return nil, fmt.Errorf("%w: device %q is not connected", ErrDeviceRouteUnavailable, deviceID) + 
} + + routeCopy := route + return &routeCopy, nil + }); err != nil { + return err + } + + signalCh := workflow.GetSignalChannel(ctx, models.TemporalDeviceRouteUpsertSignalName) + for { + cancelled := false + selector := workflow.NewSelector(ctx) + selector.AddReceive(signalCh, func(c workflow.ReceiveChannel, _ bool) { + var route models.DeviceConnectionState + c.Receive(ctx, &route) + route.DeviceID = strings.TrimSpace(route.DeviceID) + if route.DeviceID == "" { + return + } + route.TaskQueue = strings.TrimSpace(route.TaskQueue) + if route.LastSeenAt.IsZero() { + route.LastSeenAt = workflow.Now(ctx) + } + route.Connected = route.TaskQueue != "" && workflow.Now(ctx).Sub(route.LastSeenAt) <= models.DeviceRouteFreshnessTTL + routes[route.DeviceID] = route + }) + selector.AddReceive(ctx.Done(), func(workflow.ReceiveChannel, bool) { + cancelled = true + }) + selector.AddFuture(workflow.NewTimer(ctx, deviceRouteRegistryTickInterval), func(workflow.Future) {}) + selector.Select(ctx) + if cancelled { + return ctx.Err() + } + } +} + +func (c *Config) PublishDeviceConnectionState(ctx context.Context, state models.DeviceConnectionState) error { + c.SetDeviceConnectionState(state) + + services := c.GetServices() + if services == nil || !services.HasTemporal() { + return nil + } + + temporalService := services.GetTemporal() + if temporalService == nil || !temporalService.HasClient() { + return nil + } + + state.DeviceID = strings.TrimSpace(state.DeviceID) + state.TaskQueue = strings.TrimSpace(state.TaskQueue) + if state.DeviceID == "" || state.TaskQueue == "" { + return nil + } + if state.LastSeenAt.IsZero() { + state.LastSeenAt = time.Now().UTC() + } + state.Connected = true + + _, err := temporalService.GetClient().SignalWithStartWorkflow( + ctx, + models.TemporalDeviceRouteRegistryWorkflowID, + models.TemporalDeviceRouteUpsertSignalName, + state, + deviceRegistryStartWorkflowOptions(models.TemporalDeviceRouteRegistryWorkflowID), + 
models.TemporalDeviceRouteRegistryWorkflowName, + ) + return err +} diff --git a/internal/config/device_route_registry_test.go b/internal/config/device_route_registry_test.go new file mode 100644 index 00000000..b038ab1f --- /dev/null +++ b/internal/config/device_route_registry_test.go @@ -0,0 +1,76 @@ +package config + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thand-io/agent/internal/models" + "go.temporal.io/sdk/testsuite" +) + +func TestDeviceRouteRegistryWorkflowReturnsFreshRouteByDeviceID(t *testing.T) { + var suite testsuite.WorkflowTestSuite + env := suite.NewTestWorkflowEnvironment() + + env.RegisterDelayedCallback(func() { + env.SignalWorkflow(models.TemporalDeviceRouteUpsertSignalName, models.DeviceConnectionState{ + DeviceID: "device-alpha", + TaskQueue: "thand-local-alpha", + Name: "Device Alpha", + Hostname: "host-one", + Platform: "local", + }) + }, time.Second) + + env.RegisterDelayedCallback(func() { + value, err := env.QueryWorkflow(models.TemporalGetDeviceRouteQueryName, "device-alpha") + require.NoError(t, err) + + var route models.DeviceConnectionState + require.NoError(t, value.Get(&route)) + assert.Equal(t, "thand-local-alpha", route.TaskQueue) + assert.Equal(t, "host-one", route.Hostname) + + env.CancelWorkflow() + }, 2*time.Second) + + env.ExecuteWorkflow(deviceRouteRegistryWorkflow) + require.True(t, env.IsWorkflowCompleted()) + require.Error(t, env.GetWorkflowError()) +} + +func TestDeviceRouteRegistryWorkflowUsesHostnameAsMetadataOnly(t *testing.T) { + var suite testsuite.WorkflowTestSuite + env := suite.NewTestWorkflowEnvironment() + + env.RegisterDelayedCallback(func() { + env.SignalWorkflow(models.TemporalDeviceRouteUpsertSignalName, models.DeviceConnectionState{ + DeviceID: "device-alpha", + TaskQueue: "thand-local-alpha", + Hostname: "host-one", + }) + env.SignalWorkflow(models.TemporalDeviceRouteUpsertSignalName, models.DeviceConnectionState{ + 
DeviceID: "device-alpha", + TaskQueue: "thand-local-alpha", + Hostname: "host-two", + }) + }, time.Second) + + env.RegisterDelayedCallback(func() { + value, err := env.QueryWorkflow(models.TemporalGetDeviceRouteQueryName, "device-alpha") + require.NoError(t, err) + + var route models.DeviceConnectionState + require.NoError(t, value.Get(&route)) + assert.Equal(t, "thand-local-alpha", route.TaskQueue) + assert.Equal(t, "host-two", route.Hostname) + + env.CancelWorkflow() + }, 2*time.Second) + + env.ExecuteWorkflow(deviceRouteRegistryWorkflow) + require.True(t, env.IsWorkflowCompleted()) + require.Error(t, env.GetWorkflowError()) +} diff --git a/internal/config/devices.go b/internal/config/devices.go new file mode 100644 index 00000000..485e3a61 --- /dev/null +++ b/internal/config/devices.go @@ -0,0 +1,111 @@ +package config + +import ( + "errors" + "fmt" + "strings" + "time" + + "github.com/thand-io/agent/internal/models" +) + +var ErrDeviceRouteUnavailable = errors.New("device route unavailable") + +const ( + deviceRouteRefreshInterval = models.DeviceRouteRefreshInterval + deviceRouteFreshnessTTL = models.DeviceRouteFreshnessTTL +) + +func (c *Config) GetDevice(deviceID string) (*models.Device, error) { + deviceID = strings.TrimSpace(deviceID) + if deviceID == "" { + return nil, fmt.Errorf("device id is required") + } + + for _, device := range c.Devices.Definitions { + configuredDeviceID := strings.TrimSpace(device.ID) + if configuredDeviceID == "" { + continue + } + if strings.EqualFold(configuredDeviceID, deviceID) { + deviceCopy := device + return &deviceCopy, nil + } + } + + return nil, fmt.Errorf("device %q is not configured", deviceID) +} + +func (c *Config) SetDeviceConnectionState(state models.DeviceConnectionState) { + if strings.TrimSpace(state.DeviceID) == "" { + return + } + + state.DeviceID = strings.TrimSpace(state.DeviceID) + state.TaskQueue = strings.TrimSpace(state.TaskQueue) + if state.LastSeenAt.IsZero() { + state.LastSeenAt = time.Now().UTC() + 
} + state.Connected = c.isFreshDeviceConnectionState(&state) + + c.deviceConnectionsMu.Lock() + defer c.deviceConnectionsMu.Unlock() + + if c.deviceConnections == nil { + c.deviceConnections = make(map[string]*models.DeviceConnectionState) + } + + stateCopy := state + c.deviceConnections[state.DeviceID] = &stateCopy +} + +func (c *Config) GetDeviceConnectionState(deviceID string) *models.DeviceConnectionState { + deviceID = strings.TrimSpace(deviceID) + if deviceID == "" { + return nil + } + + c.deviceConnectionsMu.RLock() + defer c.deviceConnectionsMu.RUnlock() + + state, ok := c.deviceConnections[deviceID] + if !ok || state == nil { + return nil + } + + stateCopy := *state + stateCopy.Connected = c.isFreshDeviceConnectionState(&stateCopy) + return &stateCopy +} + +func (c *Config) GetFreshDeviceRoute(deviceID string) (*models.DeviceConnectionState, error) { + device, err := c.GetDevice(deviceID) + if err != nil { + return nil, err + } + if !device.Enabled { + return nil, fmt.Errorf("%w: device %q is disabled", ErrDeviceRouteUnavailable, device.ID) + } + connectionState := c.GetDeviceConnectionState(device.ID) + if connectionState == nil || !connectionState.Connected { + return nil, fmt.Errorf("%w: device %q is not connected", ErrDeviceRouteUnavailable, device.ID) + } + if strings.TrimSpace(connectionState.TaskQueue) == "" { + return nil, fmt.Errorf("%w: device %q has no live task queue", ErrDeviceRouteUnavailable, device.ID) + } + + return connectionState, nil +} + +func (c *Config) isFreshDeviceConnectionState(state *models.DeviceConnectionState) bool { + if state == nil { + return false + } + if strings.TrimSpace(state.TaskQueue) == "" { + return false + } + if state.LastSeenAt.IsZero() { + return false + } + return time.Since(state.LastSeenAt) <= deviceRouteFreshnessTTL +} diff --git a/internal/config/devices_test.go b/internal/config/devices_test.go new file mode 100644 index 00000000..09c5889c --- /dev/null +++ b/internal/config/devices_test.go @@ -0,0 
+1,117 @@ +package config + +import ( + "strings" + "testing" + "time" + + "github.com/thand-io/agent/internal/models" +) + +func TestGetFreshDeviceRouteUsesConnectedTaskQueue(t *testing.T) { + cfg := &Config{ + Devices: DeviceDefinitionsConfig{ + Definitions: map[string]models.Device{ + "device-alpha": { + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }, + }, + }, + } + + cfg.SetDeviceConnectionState(models.DeviceConnectionState{ + DeviceID: "device-alpha", + TaskQueue: "connected-queue", + }) + + route, err := cfg.GetFreshDeviceRoute("device-alpha") + if err != nil { + t.Fatalf("GetFreshDeviceRoute returned error: %v", err) + } + + if got, want := route.TaskQueue, "connected-queue"; got != want { + t.Fatalf("task queue = %q, want %q", got, want) + } + if !route.Connected { + t.Fatal("expected fresh route to be marked connected") + } +} + +func TestGetFreshDeviceRouteRejectsStaleConnection(t *testing.T) { + cfg := &Config{ + Devices: DeviceDefinitionsConfig{ + Definitions: map[string]models.Device{ + "device-alpha": { + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }, + }, + }, + } + + cfg.SetDeviceConnectionState(models.DeviceConnectionState{ + DeviceID: "device-alpha", + TaskQueue: "connected-queue", + LastSeenAt: time.Now().UTC().Add(-deviceRouteFreshnessTTL - time.Second), + }) + + _, err := cfg.GetFreshDeviceRoute("device-alpha") + if err == nil { + t.Fatal("expected stale route error") + } + if !strings.Contains(err.Error(), `device "device-alpha" is not connected`) { + t.Fatalf("unexpected error: %v", err) + } + + state := cfg.GetDeviceConnectionState("device-alpha") + if state == nil { + t.Fatal("expected stored connection state") + } + if state.Connected { + t.Fatal("expected stale connection state to be marked disconnected") + } +} + +func TestGetDeviceUsesCanonicalDeviceID(t *testing.T) { + cfg := &Config{ + Devices: DeviceDefinitionsConfig{ + Definitions: map[string]models.Device{ + "workstation-alpha": { + ID: 
"device-alpha", + Name: "Device Alpha", + Enabled: true, + }, + }, + }, + } + + device, err := cfg.GetDevice("device-alpha") + if err != nil { + t.Fatalf("GetDevice returned error: %v", err) + } + + if got, want := device.ID, "device-alpha"; got != want { + t.Fatalf("device id = %q, want %q", got, want) + } +} + +func TestGetDeviceDoesNotTreatMapKeyAsIdentity(t *testing.T) { + cfg := &Config{ + Devices: DeviceDefinitionsConfig{ + Definitions: map[string]models.Device{ + "workstation-alpha": { + ID: "device-alpha", + Name: "Device Alpha", + Enabled: true, + }, + }, + }, + } + + if _, err := cfg.GetDevice("workstation-alpha"); err == nil { + t.Fatal("expected GetDevice to reject YAML map key as device identity") + } +} diff --git a/internal/config/environment/gcp/workflows.yaml b/internal/config/environment/gcp/workflows.yaml index e7ff594f..c2f037bd 100644 --- a/internal/config/environment/gcp/workflows.yaml +++ b/internal/config/environment/gcp/workflows.yaml @@ -31,4 +31,4 @@ workflows: - revoke: thand: revoke then: end - \ No newline at end of file + diff --git a/internal/config/environment/kubernetes/workflows.yaml b/internal/config/environment/kubernetes/workflows.yaml index 470c5e89..5fbe290c 100644 --- a/internal/config/environment/kubernetes/workflows.yaml +++ b/internal/config/environment/kubernetes/workflows.yaml @@ -31,4 +31,4 @@ workflows: - revoke: thand: revoke then: end - \ No newline at end of file + diff --git a/internal/config/environment/local/providers.yaml b/internal/config/environment/local/providers.yaml index a3e41475..6cb61d08 100644 --- a/internal/config/environment/local/providers.yaml +++ b/internal/config/environment/local/providers.yaml @@ -12,3 +12,15 @@ providers: description: Local privilege elevation (like sudo or UAC) provider: local enabled: true + + local-presence: + name: Local Presence + description: Local macOS device-owner approval prompts + provider: local-presence + enabled: true + + local-notification: + name: Local 
Notification + description: Local macOS user notifications + provider: local-notification + enabled: true diff --git a/internal/config/environment/local/roles.yaml b/internal/config/environment/local/roles.yaml index a4395b88..9c6d62c1 100644 --- a/internal/config/environment/local/roles.yaml +++ b/internal/config/environment/local/roles.yaml @@ -19,6 +19,7 @@ roles: - path:/root # Only allow access to /root directory providers: - local + - local-elevation enabled: true # Power Users Group @@ -41,6 +42,7 @@ roles: - path:/opt providers: - local + - local-elevation enabled: true # Users Group @@ -62,6 +64,20 @@ roles: - path:/var/tmp providers: - local + - local-elevation + enabled: true + + local_sudo: + name: Local Sudo + description: Time-bound local sudo access + workflows: + - local_sudo_timed_elevation + permissions: + allow: + - local:sudo:* + providers: + - local + - local-elevation enabled: true # Operators Group @@ -84,5 +100,5 @@ roles: - path:/etc/systemd providers: - local + - local-elevation enabled: true - diff --git a/internal/config/environment/local/workflows.yaml b/internal/config/environment/local/workflows.yaml index e00b2eb0..07c4280d 100644 --- a/internal/config/environment/local/workflows.yaml +++ b/internal/config/environment/local/workflows.yaml @@ -32,3 +32,34 @@ workflows: thand: revoke then: end + local_sudo_timed_elevation: + name: "Local Sudo Timed Elevation" + description: Time-bound local sudo access on a server-managed device + authentication: default + enabled: true + workflow: + document: + dsl: "1.0.0-alpha5" + namespace: "thand" + name: "local-sudo-timed-elevation" + version: "1.0.0" + do: + - validate: + thand: validate + with: + validator: static + then: authorize + - authorize: + thand: authorize + with: + revocation: revoke + then: monitor + - monitor: + thand: monitor + with: + monitor: basic + threshold: 100 + then: revoke + - revoke: + thand: revoke + then: end diff --git a/internal/config/execution_plan.go 
b/internal/config/execution_plan.go new file mode 100644 index 00000000..3e84daa8 --- /dev/null +++ b/internal/config/execution_plan.go @@ -0,0 +1,195 @@ +package config + +import ( + "fmt" + "strings" + + "github.com/thand-io/agent/internal/models" +) + +type executionPlanBuildOptions struct { + LookupDeviceDefinition func(deviceID string) (*models.Device, error) + Decorators []executionPlanDecorator +} + +// executionPlanDecorator lets device-local or provider-specific request shaping +// stay close to the feature that needs it instead of branching inside the +// Temporal activity that drives planning. +type executionPlanDecorator interface { + Applies(elevateRequest *models.ElevateRequestInternal) bool + // Decorate runs before EntryID creation and provider request materialization. + // Use it to populate request metadata and routing fields that should + // contribute to the stable execution-plan entry identity. + Decorate( + cfg models.ConfigImpl, + req *models.WorkflowRoleRequest, + elevateRequest *models.ElevateRequestInternal, + opts executionPlanBuildOptions, + ) error + // Finalize runs after EntryID creation. Use it for metadata that must depend + // on the stable entry identity itself, such as broker grant IDs, without + // feeding that generated value back into the EntryID calculation. 
+ Finalize( + req *models.WorkflowRoleRequest, + elevateRequest *models.ElevateRequestInternal, + entryID string, + ) error +} + +func BuildExecutionPlan( + cfg models.ConfigImpl, + workflowID string, + elevateRequest *models.ElevateRequestInternal, +) (*models.ExecutionPlan, error) { + return BuildExecutionPlanWithOptions(cfg, workflowID, elevateRequest, executionPlanBuildOptions{}) +} + +func BuildExecutionPlanWithOptions( + cfg models.ConfigImpl, + workflowID string, + elevateRequest *models.ElevateRequestInternal, + opts executionPlanBuildOptions, +) (*models.ExecutionPlan, error) { + if elevateRequest == nil { + return nil, fmt.Errorf("elevate request is required for execution planning") + } + if len(elevateRequest.Providers) == 0 { + return nil, fmt.Errorf("no providers specified for authorization") + } + if len(elevateRequest.Identities) == 0 { + return nil, fmt.Errorf("no identities specified for authorization") + } + + opts = opts.withDefaults(cfg) + + duration, err := elevateRequest.AsDuration() + if err != nil { + return nil, fmt.Errorf("failed to get duration: %w", err) + } + + workflowName := strings.TrimSpace(elevateRequest.GetWorkflow()) + if workflowName == "" { + return nil, fmt.Errorf("workflow name is required for execution planning") + } + + tenants := elevateRequest.Tenants + if len(tenants) == 0 { + tenants = []string{""} + } + + plan := &models.ExecutionPlan{WorkflowName: workflowName} + + for _, providerName := range elevateRequest.Providers { + providerName = strings.TrimSpace(providerName) + if providerName == "" { + return nil, fmt.Errorf("execution plan entry is missing provider name") + } + + provider, err := cfg.GetProviderByName(providerName) + if err != nil { + return nil, fmt.Errorf("failed to get provider %q: %w", providerName, err) + } + + for _, identityID := range elevateRequest.Identities { + resolvedIdentity := resolveIdentitySnapshot(cfg, identityID) + + for _, tenantID := range tenants { + workflowReq := 
&models.WorkflowRoleRequest{ + WorkflowID: workflowID, + Identity: identityID, + ResolvedIdentity: resolvedIdentity, + Role: elevateRequest.Role, + Duration: &duration, + Tenant: tenantID, + } + + if err := applyExecutionPlanDecorators(cfg, workflowReq, elevateRequest, opts); err != nil { + return nil, err + } + + entryID := models.CreateExecutionPlanEntryID(workflowID, providerName, workflowReq) + if err := finalizeExecutionPlanDecorators(workflowReq, elevateRequest, entryID, opts); err != nil { + return nil, err + } + + authorizeRequest, err := models.CreateAuthorizeRoleRequest(cfg, provider, workflowReq) + if err != nil { + return nil, fmt.Errorf("failed to create authorize role request for provider %q and identity %q: %w", providerName, identityID, err) + } + + plan.Entries = append(plan.Entries, models.ExecutionPlanEntry{ + EntryID: entryID, + ProviderName: providerName, + DeviceID: workflowReq.DeviceID, + AuthorizeRequest: authorizeRequest, + }) + } + } + } + + if !plan.IsValid() { + return nil, fmt.Errorf("execution plan did not contain any entries") + } + + return plan, nil +} + +func (opts executionPlanBuildOptions) withDefaults(cfg models.ConfigImpl) executionPlanBuildOptions { + if opts.LookupDeviceDefinition == nil { + opts.LookupDeviceDefinition = cfg.GetDevice + } + if opts.Decorators == nil { + opts.Decorators = []executionPlanDecorator{ + localSudoExecutionPlanDecorator{}, + } + } + return opts +} + +func applyExecutionPlanDecorators( + cfg models.ConfigImpl, + req *models.WorkflowRoleRequest, + elevateRequest *models.ElevateRequestInternal, + opts executionPlanBuildOptions, +) error { + if req == nil { + return fmt.Errorf("workflow role request is required for execution planning") + } + + for _, decorator := range opts.Decorators { + if decorator == nil || !decorator.Applies(elevateRequest) { + continue + } + if err := decorator.Decorate(cfg, req, elevateRequest, opts); err != nil { + return err + } + } + + return nil +} + +func 
finalizeExecutionPlanDecorators( + req *models.WorkflowRoleRequest, + elevateRequest *models.ElevateRequestInternal, + entryID string, + opts executionPlanBuildOptions, +) error { + for _, decorator := range opts.Decorators { + if decorator == nil || !decorator.Applies(elevateRequest) { + continue + } + if err := decorator.Finalize(req, elevateRequest, entryID); err != nil { + return err + } + } + + return nil +} + +func resolveIdentitySnapshot(cfg models.ConfigImpl, identityID string) *models.Identity { + identityResult, err := cfg.GetIdentity(identityID) + if err != nil || identityResult == nil || identityResult.User == nil { + return nil + } + return identityResult +} diff --git a/internal/config/execution_plan_activity_test.go b/internal/config/execution_plan_activity_test.go new file mode 100644 index 00000000..ac7662ea --- /dev/null +++ b/internal/config/execution_plan_activity_test.go @@ -0,0 +1,137 @@ +package config + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thand-io/agent/internal/models" +) + +type executionPlanActivityTestProvider struct { + *models.BaseProvider +} + +func (p *executionPlanActivityTestProvider) ValidateRole( + ctx context.Context, + user *models.Identity, + role *models.Role, +) (map[string]any, error) { + return map[string]any{}, nil +} + +func newExecutionPlanActivityTestProvider(identifier string) *executionPlanActivityTestProvider { + caps := models.NewProviderCapabilities().WithDefaultProvisioningConfiguration() + providerCfg := models.ProviderConfig{ + Name: identifier, + Provider: identifier, + Enabled: true, + Capabilities: caps, + Config: &models.BasicConfig{}, + } + + provider := &executionPlanActivityTestProvider{ + BaseProvider: models.NewBaseProvider(identifier, providerCfg, caps), + } + provider.SetReady() + return provider +} + +func newExecutionPlanActivityRole(identifier, name string) *models.Role { + return &models.Role{ + 
Identifier: identifier, + Name: name, + Enabled: true, + Permissions: models.RolePermissions{ + Allow: models.RoleStatements{{Operations: []string{"local:test"}}}, + }, + } +} + +func TestBuildExecutionPlanActivityUsesSharedDeviceDefinitionsForLocalSudo(t *testing.T) { + t.Parallel() + + cfg := DefaultConfig() + cfg.AddProvider("local-elevation", newExecutionPlanActivityTestProvider("local")) + + activities := &thandActivities{ + config: cfg, + lookupDeviceDefinition: func(ctx context.Context, deviceID string) (*models.Device, error) { + return &models.Device{ + ID: deviceID, + Name: "Device Alpha", + Enabled: true, + LocalElevation: &models.DeviceLocalElevationPolicy{ + Enabled: true, + AllowedModes: []string{string(models.LocalSudoModeTimed)}, + Accounts: []models.DeviceLocalElevationAccount{ + {Email: "user@example.com", LocalUsername: "workstation-user"}, + }, + DeniedUsernames: []string{"root"}, + AllowedUIDRanges: []string{"1000-60000"}, + }, + }, nil + }, + } + + plan, err := activities.BuildExecutionPlan(context.Background(), models.ExecutionPlanRequest{ + WorkflowID: "wf-local-sudo", + ElevateRequest: &models.ElevateRequestInternal{ + ElevateRequest: models.ElevateRequest{ + Role: newExecutionPlanActivityRole(models.LocalSudoRoleIdentifier, "Local Sudo"), + Providers: []string{"local-elevation"}, + Workflow: models.LocalSudoTimedWorkflowName, + Device: "device-alpha", + Reason: "maintenance", + Duration: "30m", + Identities: []string{"user@example.com"}, + Metadata: models.LocalSudoRequestMetadata{ + Mode: models.LocalSudoModeTimed, + }.AsMap(), + }, + }, + }) + require.NoError(t, err) + require.Len(t, plan.Entries, 1) + + meta, err := models.DecodeLocalSudoRequestMetadata(plan.Entries[0].AuthorizeRequest.Metadata) + require.NoError(t, err) + assert.Equal(t, "device-alpha", plan.Entries[0].DeviceID) + assert.Equal(t, "workstation-user", meta.LocalUsername) + assert.Equal(t, []string{"root"}, meta.DeniedUsernames) +} + +func 
TestBuildExecutionPlanActivityFailsWhenSharedDeviceDefinitionIsMissing(t *testing.T) { + t.Parallel() + + cfg := DefaultConfig() + cfg.AddProvider("local-elevation", newExecutionPlanActivityTestProvider("local")) + + activities := &thandActivities{ + config: cfg, + lookupDeviceDefinition: func(ctx context.Context, deviceID string) (*models.Device, error) { + return nil, assert.AnError + }, + } + + _, err := activities.BuildExecutionPlan(context.Background(), models.ExecutionPlanRequest{ + WorkflowID: "wf-local-sudo", + ElevateRequest: &models.ElevateRequestInternal{ + ElevateRequest: models.ElevateRequest{ + Role: newExecutionPlanActivityRole(models.LocalSudoRoleIdentifier, "Local Sudo"), + Providers: []string{"local-elevation"}, + Workflow: models.LocalSudoTimedWorkflowName, + Device: "device-alpha", + Reason: "maintenance", + Duration: "30m", + Identities: []string{"user@example.com"}, + Metadata: models.LocalSudoRequestMetadata{ + Mode: models.LocalSudoModeTimed, + }.AsMap(), + }, + }, + }) + require.Error(t, err) + assert.ErrorContains(t, err, "assert.AnError general error for testing") +} diff --git a/internal/config/local_sudo_execution_plan.go b/internal/config/local_sudo_execution_plan.go new file mode 100644 index 00000000..d0dd9273 --- /dev/null +++ b/internal/config/local_sudo_execution_plan.go @@ -0,0 +1,107 @@ +package config + +import ( + "fmt" + "strings" + + "github.com/thand-io/agent/internal/models" +) + +type localSudoExecutionPlanDecorator struct{} + +func (localSudoExecutionPlanDecorator) Applies(elevateRequest *models.ElevateRequestInternal) bool { + return elevateRequest != nil && models.IsLocalSudoRequest(&elevateRequest.ElevateRequest) +} + +func (localSudoExecutionPlanDecorator) Decorate( + cfg models.ConfigImpl, + req *models.WorkflowRoleRequest, + elevateRequest *models.ElevateRequestInternal, + opts executionPlanBuildOptions, +) error { + meta, err := buildLocalSudoRequestMetadata(cfg, elevateRequest, req.Identity, req.ResolvedIdentity, 
opts.LookupDeviceDefinition) + if err != nil { + return err + } + + req.DeviceID = meta.DeviceID + req.Metadata = meta.AsMap() + return nil +} + +func (localSudoExecutionPlanDecorator) Finalize( + req *models.WorkflowRoleRequest, + elevateRequest *models.ElevateRequestInternal, + entryID string, +) error { + meta, err := models.DecodeLocalSudoRequestMetadata(req.Metadata) + if err != nil { + return err + } + meta.GrantID = entryID + req.Metadata = meta.AsMap() + return nil +} + +func buildLocalSudoRequestMetadata( + cfg models.ConfigImpl, + elevateRequest *models.ElevateRequestInternal, + identityID string, + resolvedIdentity *models.Identity, + lookupDeviceDefinition func(deviceID string) (*models.Device, error), +) (models.LocalSudoRequestMetadata, error) { + meta, err := models.DecodeLocalSudoRequestMetadata(elevateRequest.Metadata) + if err != nil { + return meta, err + } + + deviceID := strings.TrimSpace(elevateRequest.Device) + if deviceID == "" { + deviceID = strings.TrimSpace(meta.DeviceID) + } + if deviceID == "" { + return meta, fmt.Errorf("local sudo request is missing a device_id") + } + + if lookupDeviceDefinition == nil { + lookupDeviceDefinition = cfg.GetDevice + } + + device, err := lookupDeviceDefinition(deviceID) + if err != nil { + return meta, err + } + if !device.Enabled { + return meta, fmt.Errorf("device %q is disabled", deviceID) + } + if device.LocalElevation == nil { + return meta, fmt.Errorf("device %q does not have local elevation configured", deviceID) + } + if !device.LocalElevation.AllowsMode(string(meta.Mode)) { + return meta, fmt.Errorf("device %q does not allow local sudo mode %q", deviceID, meta.Mode) + } + + identity := resolvedIdentity + if identity == nil { + identity, err = cfg.GetIdentity(identityID) + } + if err != nil || identity == nil { + identity = &models.Identity{ + ID: identityID, + User: &models.User{ + Email: identityID, + }, + } + } + + localUsername, err := device.LocalElevation.ResolveLocalUsername(identityID, 
identity) + if err != nil { + return meta, err + } + + meta.DeviceID = device.ID + meta.LocalUsername = localUsername + meta.DeniedUsernames = append([]string(nil), device.LocalElevation.DeniedUsernames...) + meta.AllowedUIDRanges = append([]string(nil), device.LocalElevation.AllowedUIDRanges...) + return meta, nil +} diff --git a/internal/config/model.go b/internal/config/model.go index 3d820544..eb432a39 100644 --- a/internal/config/model.go +++ b/internal/config/model.go @@ -54,6 +54,7 @@ type Config struct { Roles RoleConfig `mapstructure:"roles"` Workflows WorkflowConfig `mapstructure:"workflows"` // These are workflows to run for role associated workflows Providers ProviderDefinitionsConfig `mapstructure:"providers"` // These are integration providers like AWS, GCP, etc. + Devices DeviceDefinitionsConfig `mapstructure:"devices"` // Device definitions and per-device policy managed by the server // This is ONLY if the agent is running in server mode // and you want to use https://www.thand.io hosted services @@ -64,12 +65,23 @@ type Config struct { logger thandLogger mu sync.RWMutex + // Incremented whenever synced config definitions actually change. + // Definition maps should be treated as immutable snapshots: callers should + // replace whole entries or whole maps rather than mutating nested state + // in place. Legacy mutation-prone paths are being tracked in issue #306. + configGeneration uint64 + // Cached services client initializeServiceClientOnce sync.Once servicesClient models.ServicesClientImpl // Provider instances providerInstances map[string]models.Provider + providerBindings map[string]struct{} + + // Device runtime state. 
+ deviceConnections map[string]*models.DeviceConnectionState + deviceConnectionsMu sync.RWMutex } func (c *Config) GetSecret() string { @@ -117,6 +129,10 @@ func (c *Config) GetProvidersConfig() *ProviderDefinitionsConfig { return &c.Providers } +func (c *Config) GetDevicesConfig() *DeviceDefinitionsConfig { + return &c.Devices +} + func (c *Config) GetThandConfig() *models.ThandConfig { return &c.Thand } @@ -232,6 +248,16 @@ func (p *ProviderDefinitionsConfig) GetDefinitions() map[string]models.ProviderC return p.Definitions } +type DeviceDefinitionsConfig struct { + Path string `mapstructure:"path" json:"path"` + + Definitions map[string]models.Device `mapstructure:",remain" json:"definitions"` +} + +func (d *DeviceDefinitionsConfig) GetDefinitions() map[string]models.Device { + return d.Definitions +} + type ProviderPluginConfig struct { Path string `mapstructure:"path"` URL string `mapstructure:"url"` @@ -280,7 +306,7 @@ func (c *Config) GetThandServerUrl() string { } func (c *Config) DiscoverThandServerApiUrl() string { - return c.discoverServerApiUrl(c.Thand.Endpoint, &model.ReferenceableAuthenticationPolicy{ + return c.discoverServerApiUrl("Thand server", c.Thand.Endpoint, &model.ReferenceableAuthenticationPolicy{ AuthenticationPolicy: &model.AuthenticationPolicy{ Bearer: &model.BearerAuthenticationPolicy{ Token: c.Thand.ApiKey, @@ -290,11 +316,12 @@ func (c *Config) DiscoverThandServerApiUrl() string { } func (c *Config) DiscoverLoginServerApiUrl(loginServer string) string { - return c.discoverServerApiUrl(loginServer, nil) + return c.discoverServerApiUrl("login server", loginServer, nil) } func (c *Config) discoverServerApiUrl( - loginServer string, + serviceName string, + serverURL string, auth *model.ReferenceableAuthenticationPolicy, ) string { @@ -302,8 +329,8 @@ func (c *Config) discoverServerApiUrl( // /.well-known/api-configuration endpoint // to get the base param which is our api endpoint using resty - discoveryCheckUrl := 
fmt.Sprintf("%s/.well-known/api-configuration", loginServer) - defaultUrl := fmt.Sprintf("%s/api/v1", loginServer) + discoveryCheckUrl := fmt.Sprintf("%s/.well-known/api-configuration", serverURL) + defaultUrl := fmt.Sprintf("%s/api/v1", serverURL) resp, err := common.InvokeHttpRequest(&model.HTTPArguments{ Endpoint: &model.Endpoint{ @@ -334,12 +361,12 @@ func (c *Config) discoverServerApiUrl( } if len(discoveryCheckResponse.BaseUrl) > 0 { - logrus.Debugf("Discovered login server base URL: %s", discoveryCheckResponse.BaseUrl) - loginServer = strings.TrimSuffix(discoveryCheckResponse.BaseUrl, "/") + logrus.Debugf("Discovered %s base URL: %s", serviceName, discoveryCheckResponse.BaseUrl) + serverURL = strings.TrimSuffix(discoveryCheckResponse.BaseUrl, "/") } trimPath := strings.TrimSuffix(strings.TrimPrefix(discoveryCheckResponse.ApiBasePath, "/"), "/") - return fmt.Sprintf("%s/%s", loginServer, trimPath) + return fmt.Sprintf("%s/%s", serverURL, trimPath) } func (c *Config) GetLoginServerHostname() string { diff --git a/internal/config/model_test.go b/internal/config/model_test.go new file mode 100644 index 00000000..eb59440c --- /dev/null +++ b/internal/config/model_test.go @@ -0,0 +1,66 @@ +package config + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/sirupsen/logrus" + logrustest "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDiscoverLoginServerApiUrl_LogsLoginServerDiscovery(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/.well-known/api-configuration", r.URL.Path) + w.Header().Set("Content-Type", "application/json") + _, err := w.Write([]byte(`{"baseUrl":"https://auth.example.com","apiBasePath":"/api/v1"}`)) + require.NoError(t, err) + })) + t.Cleanup(server.Close) + + hook := logrustest.NewGlobal() + defer hook.Reset() + + oldLevel := logrus.GetLevel() + 
logrus.SetLevel(logrus.DebugLevel) + defer logrus.SetLevel(oldLevel) + + config := &Config{} + + apiURL := config.DiscoverLoginServerApiUrl(server.URL) + + require.Equal(t, "https://auth.example.com/api/v1", apiURL) + lastEntry := hook.LastEntry() + require.NotNil(t, lastEntry) + assert.Equal(t, "Discovered login server base URL: https://auth.example.com", lastEntry.Message) +} + +func TestDiscoverThandServerApiUrl_LogsThandServerDiscovery(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + require.Equal(t, "/.well-known/api-configuration", r.URL.Path) + w.Header().Set("Content-Type", "application/json") + _, err := w.Write([]byte(`{"baseUrl":"https://config.example.com","apiBasePath":"/api/v1"}`)) + require.NoError(t, err) + })) + t.Cleanup(server.Close) + + hook := logrustest.NewGlobal() + defer hook.Reset() + + oldLevel := logrus.GetLevel() + logrus.SetLevel(logrus.DebugLevel) + defer logrus.SetLevel(oldLevel) + + config := &Config{} + config.Thand.Endpoint = server.URL + config.Thand.ApiKey = "test-token" + + apiURL := config.DiscoverThandServerApiUrl() + + require.Equal(t, "https://config.example.com/api/v1", apiURL) + lastEntry := hook.LastEntry() + require.NotNil(t, lastEntry) + assert.Equal(t, "Discovered Thand server base URL: https://config.example.com", lastEntry.Message) +} diff --git a/internal/config/providers.go b/internal/config/providers.go index 747ffb8a..55fe920e 100644 --- a/internal/config/providers.go +++ b/internal/config/providers.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "strings" + "sync" "github.com/hashicorp/go-version" "github.com/sirupsen/logrus" @@ -31,6 +32,8 @@ import ( _ "github.com/thand-io/agent/internal/providers/thand" ) +var providerBindingsMu sync.Mutex + // LoadProviders loads providers from a file or URL and maps them to their implementations func (c *Config) LoadProviders() (map[string]models.ProviderConfig, error) { @@ -230,130 +233,18 @@ func (c *Config) 
InitializeProviders() error { providerResult := result.provider - // Check for capabilities for RBAC and Identities - if providerResult.HasAnyCapability( - models.ProviderCapabilityIdentities, - models.ProviderCapabilityUsers, - models.ProviderCapabilityGroups, - models.ProviderCapabilityResources, - models.ProviderCapabilityRoles, - models.ProviderCapabilityPermissions, - models.ProviderCapabilityTenants, - ) { + if providerNeedsTemporalBindings(providerResult) { + logrus.Infoln("Provider", result.key, "supports Temporal provider bindings") - logrus.Infoln("Provider", result.key, "supports RBAC/Identities capabilities") + if err := c.registerProviderTemporalBindings(providerResult); err != nil { + logrus.WithError(err).Errorln("Failed to register Temporal bindings for provider:", result.key) + continue + } - // Register provider workflows and activities with Temporal if available if c.IsServer() { - - if c.GetServices() != nil && c.GetServices().HasTemporal() { - - logrus.Infoln("Registering Temporal workflows/activities for provider", result.key) - - temporalService := c.GetServices().GetTemporal() - - worker := temporalService.GetWorker() - - if worker == nil { - logrus.Errorln("Temporal client is configured but worker is nil, cannot register workflows/activities for provider", result.key) - continue - } - - syncWorkflowName := models.CreateTemporalProviderWorkflowName( - providerResult.GetIdentifier(), - models.TemporalSynchronizeWorkflowName, - ) - - logrus.WithFields(logrus.Fields{ - "workflow": syncWorkflowName, - }).Infoln("Registering provider synchronize workflow with name", syncWorkflowName) - - // Register the provider Synchronize workflow. This updates roles, permissions, - // resources and identities for RBAC. We register this on the provider itself since it's a core part of the provider's functionality, but we register all other workflows and activities separately to allow providers to opt out of Temporal if they want. 
- worker.RegisterWorkflowWithOptions( - models.CreateProviderSynchronizeWorkflow(providerResult), - workflow.RegisterOptions{ - Name: syncWorkflowName, - VersioningBehavior: workflow.VersioningBehaviorPinned, - }, - ) - - if providerResult.HasCapability(models.ProviderCapabilityProvisioning) { - - authWorkflowName := models.CreateTemporalProviderWorkflowName( - providerResult.GetIdentifier(), - models.TemporalAuthorizeRoleWorkflowName) - - logrus.WithFields(logrus.Fields{ - "workflow": authWorkflowName, - "provider": providerResult.GetIdentifier(), - }).Infoln("Registering provider authorize role workflow with name", authWorkflowName) - - // Register the provider-specific authorize and revoke role workflows. - // These are closure-based: they capture the live provider instance so the - // child workflow can call provider.AuthorizeRole / RevokeRole with a - // full workflow.Context, allowing providers to dispatch activities, - // use workflow.Go, etc. - worker.RegisterWorkflowWithOptions( - models.CreateProviderAuthorizeRoleWorkflow(providerResult), - workflow.RegisterOptions{ - Name: authWorkflowName, - VersioningBehavior: workflow.VersioningBehaviorPinned, - }, - ) - - revokeWorkflowName := models.CreateTemporalProviderWorkflowName( - providerResult.GetIdentifier(), - models.TemporalRevokeRoleWorkflowName) - - logrus.WithFields(logrus.Fields{ - "workflow": revokeWorkflowName, - "provider": providerResult.GetIdentifier(), - }).Infoln("Registering provider revoke role workflow with name", revokeWorkflowName) - - worker.RegisterWorkflowWithOptions( - models.CreateProviderRevokeRoleWorkflow(providerResult), - workflow.RegisterOptions{ - Name: revokeWorkflowName, - VersioningBehavior: workflow.VersioningBehaviorPinned, - }, - ) - } - // Register all custom provider workflows - workflowsRegistry := providerResult.RegisterWorkflows() - if workflowsRegistry != nil { - logrus.Infoln("Registering Temporal workflows for provider", result.key) - 
worker.RegisterWorkflow(workflowsRegistry) - } - - // Register default provider activities - err := models.RegisterProviderActivities(temporalService, providerResult, c) - if err != nil { - logrus.WithError(err).Errorln("Failed to register default activities for provider:", result.key) - continue - } - - customActivities := providerResult.RegisterActivities() - if customActivities != nil { - // Now register any custom activities defined by the provider - err = models.RegisterActivities( - temporalService, - providerResult.GetIdentifier(), - customActivities, - ) - if err != nil { - logrus.WithError(err).Errorln("Failed to register custom activities for provider:", result.key) - continue - } - } - } - logrus.Infoln("Synchronizing provider", result.key) c.synchronizeProvider(result.provider) - } else { - logrus.Infoln("Skipping Temporal registration for provider", result.key, "in non-server mode") - // Non-server mode: provider won't be synchronized, mark ready immediately providerResult.SetReady() } } else { @@ -379,6 +270,127 @@ func (c *Config) InitializeProviders() error { return nil } +func (c *Config) registerProviderTemporalBindings(providerResult models.Provider) error { + if providerResult == nil { + return fmt.Errorf("provider is nil") + } + if c.GetServices() == nil || !c.GetServices().HasTemporal() { + logrus.WithFields(logrus.Fields{ + "provider": providerResult.GetIdentifier(), + "mode": c.GetMode(), + }).Info("Skipping provider Temporal registration because Temporal is unavailable") + return nil + } + + providerBindingsMu.Lock() + defer providerBindingsMu.Unlock() + + if c.providerBindings == nil { + c.providerBindings = map[string]struct{}{} + } + if _, exists := c.providerBindings[providerResult.GetIdentifier()]; exists { + return nil + } + + // Provider bindings should stay on operational workers and never leak onto + // the shared device-registry queue. 
+ temporalService := c.getOperationalTemporalService() + worker := temporalService.GetWorker() + if worker == nil { + return fmt.Errorf("temporal client is configured but worker is nil") + } + + syncWorkflowName := models.CreateTemporalProviderWorkflowName( + providerResult.GetIdentifier(), + models.TemporalSynchronizeWorkflowName, + ) + + worker.RegisterWorkflowWithOptions( + models.CreateProviderSynchronizeWorkflow(providerResult), + workflow.RegisterOptions{ + Name: syncWorkflowName, + VersioningBehavior: workflow.VersioningBehaviorPinned, + }, + ) + + if providerResult.HasCapability(models.ProviderCapabilityProvisioning) { + authWorkflowName := models.CreateTemporalProviderWorkflowName( + providerResult.GetIdentifier(), + models.TemporalAuthorizeRoleWorkflowName, + ) + worker.RegisterWorkflowWithOptions( + models.CreateProviderAuthorizeRoleWorkflow(providerResult), + workflow.RegisterOptions{ + Name: authWorkflowName, + VersioningBehavior: workflow.VersioningBehaviorPinned, + }, + ) + + revokeWorkflowName := models.CreateTemporalProviderWorkflowName( + providerResult.GetIdentifier(), + models.TemporalRevokeRoleWorkflowName, + ) + worker.RegisterWorkflowWithOptions( + models.CreateProviderRevokeRoleWorkflow(providerResult), + workflow.RegisterOptions{ + Name: revokeWorkflowName, + VersioningBehavior: workflow.VersioningBehaviorPinned, + }, + ) + } + + if workflowsRegistry := providerResult.RegisterWorkflows(); workflowsRegistry != nil { + worker.RegisterWorkflow(workflowsRegistry) + } + + if err := models.RegisterProviderActivities(temporalService, providerResult, c); err != nil { + return err + } + + if customActivities := providerResult.RegisterActivities(); customActivities != nil { + if err := models.RegisterActivities(temporalService, providerResult.GetIdentifier(), customActivities); err != nil { + return err + } + } + + c.providerBindings[providerResult.GetIdentifier()] = struct{}{} + return nil +} + +func providerNeedsTemporalBindings(provider 
models.Provider) bool { + return provider.HasAnyCapability( + models.ProviderCapabilityIdentities, + models.ProviderCapabilityUsers, + models.ProviderCapabilityGroups, + models.ProviderCapabilityResources, + models.ProviderCapabilityRoles, + models.ProviderCapabilityPermissions, + models.ProviderCapabilityTenants, + models.ProviderCapabilityNotifier, + models.ProviderCapabilityWebhook, + ) +} + +func (c *Config) EnsureProviderTemporalBindings() error { + c.mu.RLock() + providers := make([]models.Provider, 0, len(c.providerInstances)) + for _, provider := range c.providerInstances { + providers = append(providers, provider) + } + c.mu.RUnlock() + + for _, provider := range providers { + if !providerNeedsTemporalBindings(provider) { + continue + } + if err := c.registerProviderTemporalBindings(provider); err != nil { + return err + } + } + + return nil +} + // initializeSingleProvider initializes a single provider func (c *Config) initializeSingleProvider(providerKey string, p *models.ProviderConfig) (models.Provider, error) { @@ -447,6 +459,10 @@ func (c *Config) GetProviders() ProviderDefinitionsConfig { return c.Providers } +func (c *Config) GetProviderDefinitions() map[string]models.ProviderConfig { + return c.Providers.Definitions +} + func (c *Config) GetProvider(providerName string) (string, models.Provider, error) { // Get the first provider by provider name @@ -513,7 +529,7 @@ func (c *Config) GetProvidersByCapabilityWithUser(user *models.User, capability if len(capability) != 0 && !provider.HasAnyCapability(capability...) 
{ logrus.WithFields(logrus.Fields{ "capabilities": provider.GetCapabilities(), - }).Debugln("Skipping provider", name, "due to missing capability:", capability) + }).Traceln("Skipping provider", name, "due to missing capability:", capability) continue } diff --git a/internal/config/providers_local.go b/internal/config/providers_local.go new file mode 100644 index 00000000..d34c9d4b --- /dev/null +++ b/internal/config/providers_local.go @@ -0,0 +1,7 @@ +package config + +import ( + _ "github.com/thand-io/agent/internal/providers/local" + _ "github.com/thand-io/agent/internal/providers/localnotification" + _ "github.com/thand-io/agent/internal/providers/localpresence" +) diff --git a/internal/config/services.go b/internal/config/services.go index 45d1d3be..2acea08e 100644 --- a/internal/config/services.go +++ b/internal/config/services.go @@ -1,6 +1,7 @@ package config import ( + "context" "fmt" "github.com/sirupsen/logrus" @@ -29,8 +30,8 @@ func (c *Config) SetupTemporal() error { logrus.Infoln("Setting up temporal services...") - if !c.IsServer() { - return fmt.Errorf("temporal services can only be set up in server mode") + if !c.IsServer() && !c.IsAgent() { + return fmt.Errorf("temporal services can only be set up in server or agent mode") } // Register workflows @@ -45,6 +46,10 @@ func (c *Config) SetupTemporal() error { return fmt.Errorf("registering temporal activities: %w", err) } + if err := c.EnsureProviderTemporalBindings(); err != nil { + return fmt.Errorf("registering provider temporal bindings: %w", err) + } + return nil } @@ -66,5 +71,16 @@ func (c *Config) StartTemporalWorkers() error { return fmt.Errorf("starting temporal workers: %w", err) } + // Device registry workflow management calls GetClient(), which blocks + // until workers have started. Run after StartWorkers so the client is + // ready, instead of during registration in SetupTemporal where it would + // deadlock. 
+ if err := c.EnsureDeviceRegistryWorkflows(context.Background()); err != nil { + return fmt.Errorf("ensuring device registries: %w", err) + } + if err := c.PublishConfiguredDeviceDefinitions(context.Background()); err != nil { + return fmt.Errorf("publishing device definitions: %w", err) + } + return nil } diff --git a/internal/config/services/client.go b/internal/config/services/client.go index c3c36204..e29435ad 100644 --- a/internal/config/services/client.go +++ b/internal/config/services/client.go @@ -410,6 +410,11 @@ func (e *localClient) ReloadTemporal() error { logrus.WithField("identities", identities).Info("Configuring Temporal workers for agent mode") } + if e.config.IsServer() { + identities = append(identities, models.TemporalDeviceRegistryTaskQueue) + logrus.WithField("identities", identities).Info("Configuring Temporal workers for server mode") + } + // Get Temporal config from services servicesConfig := e.config.GetServicesConfig() diff --git a/internal/config/services/temporal/main.go b/internal/config/services/temporal/main.go index abc25290..3907b849 100644 --- a/internal/config/services/temporal/main.go +++ b/internal/config/services/temporal/main.go @@ -62,6 +62,40 @@ func NewTemporalClient( } } +func (a *TemporalClient) shouldUseVersioning(identity string) bool { + if a.config.DisableVersioning { + return false + } + // Keep the shared device-registry queue unversioned. Its singleton + // workflows are internal infrastructure and are reconstructed from server + // startup publication plus agent route refreshes, so the operational + // versioned deployment path is unnecessary here and has proven brittle. 
+ return identity != models.TemporalDeviceRegistryTaskQueue +} + +func (a *TemporalClient) workerOptionsForIdentity(identity string, buildID string) worker.Options { + workerOptions := worker.Options{ + Identity: a.GetIdentity(), + MaxConcurrentActivityTaskPollers: 5, + } + + if !a.shouldUseVersioning(identity) { + return workerOptions + } + + workerOptions.DeploymentOptions = worker.DeploymentOptions{ + UseVersioning: true, + Version: worker.WorkerDeploymentVersion{ + DeploymentName: sdkConstants.TemporalDeploymentName, + BuildID: buildID, + }, + // Default workflows to Pinned behavior + DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, + } + + return workerOptions +} + func (a *TemporalClient) Initialize() error { if len(a.identities) == 0 { @@ -109,27 +143,11 @@ func (a *TemporalClient) Initialize() error { // Get agent version for Worker Build ID buildID := common.GetBuildIdentifier() - - workerOptions := worker.Options{ - Identity: a.GetIdentity(), - MaxConcurrentActivityTaskPollers: 5, - } - if !a.config.DisableVersioning { logrus.WithFields(logrus.Fields{ "BuildID": buildID, "DeploymentName": sdkConstants.TemporalDeploymentName, }).Info("Configuring Worker with versioning") - - workerOptions.DeploymentOptions = worker.DeploymentOptions{ - UseVersioning: true, - Version: worker.WorkerDeploymentVersion{ - DeploymentName: sdkConstants.TemporalDeploymentName, - BuildID: buildID, - }, - // Default workflows to Pinned behavior - DefaultVersioningBehavior: workflow.VersioningBehaviorPinned, - } } // Create a worker for each identity (task queue). 
@@ -143,6 +161,11 @@ func (a *TemporalClient) Initialize() error { } for _, identity := range a.identities { + workerOptions := a.workerOptionsForIdentity(identity, buildID) + if !workerOptions.DeploymentOptions.UseVersioning && !a.config.DisableVersioning && identity == models.TemporalDeviceRegistryTaskQueue { + logrus.WithField("taskQueue", identity).Info("Starting Temporal worker without versioning for shared device registry queue") + } + newWorker := worker.New( temporalClient, identity, @@ -182,6 +205,7 @@ func (c *TemporalClient) StartWorkers() error { buildID := common.GetBuildIdentifier() startedCount := 0 + hasVersionedWorkers := false for identity, w := range c.workers { logrus.WithFields(logrus.Fields{ @@ -197,6 +221,9 @@ func (c *TemporalClient) StartWorkers() error { continue } + if c.shouldUseVersioning(identity) { + hasVersionedWorkers = true + } startedCount++ } @@ -209,7 +236,7 @@ func (c *TemporalClient) StartWorkers() error { // If versioning is enabled, confirm our deployment version is registered // on the Temporal server before allowing workflow submissions via GetClient(). 
- if c.config.DisableVersioning { + if c.config.DisableVersioning || !hasVersionedWorkers { c.markReady() } else { go c.awaitVersionRegistration(buildID) diff --git a/internal/config/services/temporal/versioning_test.go b/internal/config/services/temporal/versioning_test.go new file mode 100644 index 00000000..e6b5741e --- /dev/null +++ b/internal/config/services/temporal/versioning_test.go @@ -0,0 +1,49 @@ +package temporal + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/thand-io/agent/internal/models" +) + +func TestShouldUseVersioningForIdentity(t *testing.T) { + t.Parallel() + + client := NewTemporalClient( + &models.TemporalConfig{ + Host: "localhost", + Port: 7233, + Namespace: "default", + DisableVersioning: false, + }, + nil, + "thand_local_alpha_server01", + models.TemporalDeviceRegistryTaskQueue, + ) + + assert.True(t, client.shouldUseVersioning("thand_local_alpha_server01")) + assert.False(t, client.shouldUseVersioning(models.TemporalDeviceRegistryTaskQueue)) +} + +func TestWorkerOptionsForIdentityKeepsRegistryQueueUnversioned(t *testing.T) { + t.Parallel() + + client := NewTemporalClient( + &models.TemporalConfig{ + Host: "localhost", + Port: 7233, + Namespace: "default", + DisableVersioning: false, + }, + nil, + "thand_local_alpha_server01", + models.TemporalDeviceRegistryTaskQueue, + ) + + operational := client.workerOptionsForIdentity("thand_local_alpha_server01", "build-123") + registry := client.workerOptionsForIdentity(models.TemporalDeviceRegistryTaskQueue, "build-123") + + assert.True(t, operational.DeploymentOptions.UseVersioning) + assert.False(t, registry.DeploymentOptions.UseVersioning) +} diff --git a/internal/config/sync.go b/internal/config/sync.go index fc6abdd1..12550f74 100644 --- a/internal/config/sync.go +++ b/internal/config/sync.go @@ -1,6 +1,7 @@ package config import ( + "bytes" "encoding/json" "fmt" "net/http" @@ -12,6 +13,34 @@ import ( "github.com/thand-io/agent/internal/models" ) +const 
maxMergeConfigurationRetries = 3 + +type configPatchSnapshot struct { + generation uint64 + request ConfigPatchRequest + data []byte + roleDefinitionsJSON []byte + workflowDefinitionsJSON []byte + providerDefinitionsJSON []byte +} + +type buildMergedConfigResult struct { + config ConfigPatchRequest + outgoingPatch []byte +} + +type normalizedDefinitionsPatch struct { + roleDefinitions map[string]models.Role + roleDefinitionsJSON []byte + roleDefinitionsChanged bool + workflowDefinitions map[string]models.Workflow + workflowDefinitionsJSON []byte + workflowDefinitionsChanged bool + providerDefinitions map[string]models.ProviderConfig + providerDefinitionsJSON []byte + providerDefinitionsChanged bool +} + type ConfigPatchRequest struct { RoleConfig *RoleConfig `json:"roles,omitempty"` WorkflowConfig *WorkflowConfig `json:"workflows,omitempty"` @@ -33,113 +62,190 @@ func (c *Config) MergeConfiguration(config *RegistrationResponse) error { return err } - roles := c.GetRolesConfig() - workflows := c.GetWorkflowsConfig() - providers := c.GetProvidersConfig() + outgoingPatch, err := c.applyMergedConfigWithRetries(func(snapshot *configPatchSnapshot) (*buildMergedConfigResult, error) { + // Apply the incoming changes over the existing configurations. + newData, err := jsonpatch.MergePatch(snapshot.data, incomingData) + if err != nil { + logrus.WithError(err).Errorln("Failed to create merge patch for configuration diffing") + return nil, err + } - existing := ConfigPatchRequest{ - RoleConfig: roles, - WorkflowConfig: workflows, - ProviderConfig: providers, - } + // Convert the merged configuration back to a struct so the in-memory + // config reflects the fully merged remote+local state rather than a + // sparse diff payload. Unmarshaling the sparse merge patch here would + // collapse omitted fields to zero values in typed structs. 
+ var mergedConfig ConfigPatchRequest + err = json.Unmarshal(newData, &mergedConfig) + if err != nil { + logrus.WithError(err).Errorln("Failed to unmarshal merged configuration") + return nil, err + } - existingData, err := json.Marshal(existing) + // Now we need to figure out what changes exist on the local system that need to + // be sent back to the server + outgoingPatch, err := jsonpatch.CreateMergePatch(incomingData, snapshot.data) + if err != nil { + logrus.WithError(err).Errorln("Failed to create merge patch for configuration diffing") + return nil, err + } + return &buildMergedConfigResult{ + config: mergedConfig, + outgoingPatch: outgoingPatch, + }, nil + }) if err != nil { - logrus.WithError(err).Errorln("Failed to marshal existing configuration for diffing") return err } - // Apply the incoming changes over the existing configurations - newData, err := jsonpatch.MergePatch(existingData, incomingData) + if !c.HasThandService() { + logrus.Debugln("Skipping configuration push-back sync because no Thand service is configured") + return nil + } - if err != nil { - logrus.WithError(err).Errorln("Failed to create merge patch for configuration diffing") - return err + if !c.Thand.Sync { + logrus.Debugln("Skipping configuration push-back sync because thand.sync is disabled") + return nil } - // Create a patch to see the differences between existing and new - incomingPatch, err := jsonpatch.CreateMergePatch(existingData, newData) + c.sendConfigurationPatch(outgoingPatch) - if err != nil { - logrus.WithError(err).Errorln("Failed to create merge patch for configuration diffing") - return err - } + return nil - // Convert patches back to structs - these are the NEW changes from the remote - // server that we need to apply to our existing configuration - var incomingDiff ConfigPatchRequest - err = json.Unmarshal(incomingPatch, &incomingDiff) +} - if err != nil { - logrus.WithError(err).Errorln("Failed to unmarshal incoming patch") - return err - } +func (c *Config) 
sendConfigurationPatch(outgoingPatch []byte) { + logrus.Debugln("Sending configuration updates back to server") - // Add these new changes to our existing configuration - err = c.applyPatch(incomingDiff) + url := fmt.Sprintf("%s/sync", c.DiscoverThandServerApiUrl()) - if err != nil { - logrus.WithError(err).Errorln("Failed to apply incoming configuration patch") - return err + authentication := &model.ReferenceableAuthenticationPolicy{ + AuthenticationPolicy: &model.AuthenticationPolicy{ + Bearer: &model.BearerAuthenticationPolicy{ + Token: c.Thand.ApiKey, + }, + }, } - // Now we need to figure out what changes exist on the local system that need to - // be sent back to the server - - outgoingPatch, err := jsonpatch.CreateMergePatch(incomingData, existingData) + resp, err := common.InvokeHttpRequest(&model.HTTPArguments{ + Method: http.MethodPatch, + Endpoint: &model.Endpoint{ + EndpointConfig: &model.EndpointConfiguration{ + URI: &model.LiteralUri{Value: url}, + Authentication: authentication, + }, + }, + Body: outgoingPatch, + }) if err != nil { - logrus.WithError(err).Errorln("Failed to create merge patch for configuration diffing") - return err + logrus.WithError(err).Errorln("Failed to send configuration updates to server") + return } - // Send the outgoing changes back to the server to update its configuration - - go func() { - - logrus.Debugln("Sending configuration updates back to server") - - url := fmt.Sprintf("%s/sync", c.DiscoverThandServerApiUrl()) + if resp.StatusCode() != http.StatusOK { + logrus.WithField("status_code", resp.StatusCode()).Errorln("Failed to send configuration updates to server") + } else { + logrus.Infoln("Successfully sent configuration updates to server") + } +} - authentication := &model.ReferenceableAuthenticationPolicy{ - AuthenticationPolicy: &model.AuthenticationPolicy{ - Bearer: &model.BearerAuthenticationPolicy{ - Token: c.Thand.ApiKey, - }, - }, +func (c *Config) applyMergedConfigWithRetries(build func(snapshot 
*configPatchSnapshot) (*buildMergedConfigResult, error)) ([]byte, error) { + for attempt := range maxMergeConfigurationRetries { + snapshot, err := c.snapshotConfigPatch() + if err != nil { + logrus.WithError(err).Errorln("Failed to marshal existing configuration for diffing") + return nil, err } - resp, err := common.InvokeHttpRequest(&model.HTTPArguments{ - Method: http.MethodPatch, - Endpoint: &model.Endpoint{ - EndpointConfig: &model.EndpointConfiguration{ - URI: &model.LiteralUri{Value: url}, - Authentication: authentication, - }, - }, - Body: outgoingPatch, - }) - + result, err := build(snapshot) if err != nil { - logrus.WithError(err).Errorln("Failed to send configuration updates to server") - return + return nil, err } - if resp.StatusCode() != http.StatusOK { - logrus.WithField("status_code", resp.StatusCode()).Errorln("Failed to send configuration updates to server") - } else { - logrus.Infoln("Successfully sent configuration updates to server") + applied, err := c.applyMergedConfigWithSnapshot(snapshot, result.config) + if err != nil { + logrus.WithError(err).Errorln("Failed to apply incoming merged configuration") + return nil, err + } + if applied { + return result.outgoingPatch, nil } - }() + logrus.WithField("attempt", attempt+1).Infoln("Configuration changed during merged sync apply, retrying") + } - return nil + logrus.WithField("attempts", maxMergeConfigurationRetries).Warnln("Configuration changed during every merged sync attempt") + return nil, fmt.Errorf("configuration changed during merge after %d attempts", maxMergeConfigurationRetries) +} + +func (c *Config) snapshotConfigPatch() (*configPatchSnapshot, error) { + c.mu.RLock() + snapshot := ConfigPatchRequest{ + RoleConfig: &RoleConfig{ + Path: c.Roles.Path, + URL: c.Roles.URL, + Vault: c.Roles.Vault, + Definitions: c.Roles.Definitions, + }, + WorkflowConfig: &WorkflowConfig{ + Path: c.Workflows.Path, + URL: c.Workflows.URL, + Vault: c.Workflows.Vault, + Plugins: c.Workflows.Plugins, + 
Definitions: c.Workflows.Definitions, + }, + ProviderConfig: &ProviderDefinitionsConfig{ + Path: c.Providers.Path, + URL: c.Providers.URL, + Vault: c.Providers.Vault, + Plugins: c.Providers.Plugins, + Definitions: c.Providers.Definitions, + }, + } + generation := c.configGeneration + + data, err := json.Marshal(snapshot) + c.mu.RUnlock() + if err != nil { + return nil, err + } + + // Keep sync retries isolated from in-place nested mutations by detaching the + // snapshot through the same JSON representation used for merge-patch diffing. + var detached ConfigPatchRequest + if err := json.Unmarshal(data, &detached); err != nil { + return nil, err + } + + roleDefinitionsJSON, err := marshalJSON(detachedRoleDefinitions(detached.RoleConfig)) + if err != nil { + return nil, err + } + workflowDefinitionsJSON, err := marshalJSON(detachedWorkflowDefinitions(detached.WorkflowConfig)) + if err != nil { + return nil, err + } + + providerDefinitionsJSON, err := marshalJSON(detachedProviderDefinitions(detached.ProviderConfig)) + if err != nil { + return nil, err + } + + return &configPatchSnapshot{ + generation: generation, + request: detached, + data: data, + roleDefinitionsJSON: roleDefinitionsJSON, + workflowDefinitionsJSON: workflowDefinitionsJSON, + providerDefinitionsJSON: providerDefinitionsJSON, + }, nil } func (c *Config) applyPatch(diff ConfigPatchRequest) error { - // Apply role changes + // applyPatch is the partial-patch helper: merge the incoming section diff + // with the current live section, then normalize and persist the result. 
if diff.RoleConfig != nil { err := c.updateRoles(diff.RoleConfig) if err != nil { @@ -148,7 +254,6 @@ func (c *Config) applyPatch(diff ConfigPatchRequest) error { } } - // Apply workflow changes if diff.WorkflowConfig != nil { err := c.updateWorkflows(diff.WorkflowConfig) if err != nil { @@ -157,7 +262,6 @@ func (c *Config) applyPatch(diff ConfigPatchRequest) error { } } - // Apply provider changes if diff.ProviderConfig != nil { err := c.updateProviders(diff.ProviderConfig) if err != nil { @@ -169,23 +273,347 @@ func (c *Config) applyPatch(diff ConfigPatchRequest) error { return nil } +// applyMergedConfig applies a fully merged server state. It normalizes each +// definitions map and stores it directly without re-merging the same sections. +func (c *Config) applyMergedConfig(config ConfigPatchRequest) error { + snapshot, err := c.snapshotConfigPatch() + if err != nil { + return err + } + + applied, err := c.applyMergedConfigWithSnapshot(snapshot, config) + if err != nil { + return err + } + if !applied { + return fmt.Errorf("configuration changed while applying merged configuration") + } + + return nil +} + +func (c *Config) applyMergedConfigWithSnapshot(snapshot *configPatchSnapshot, config ConfigPatchRequest) (bool, error) { + normalized, err := c.normalizeMergedConfig(snapshot, config) + if err != nil { + return false, err + } + + return c.commitMergedDefinitions(normalized, snapshot.generation), nil +} + +func (c *Config) normalizeMergedConfig(snapshot *configPatchSnapshot, config ConfigPatchRequest) (*normalizedDefinitionsPatch, error) { + normalized := &normalizedDefinitionsPatch{} + + if config.RoleConfig != nil { + mergedRoleDefinitionsJSON, err := marshalJSON(detachedRoleDefinitions(config.RoleConfig)) + if err != nil { + return nil, err + } + if !bytes.Equal(snapshot.roleDefinitionsJSON, mergedRoleDefinitionsJSON) { + defs, defsJSON, err := c.normalizeRoleDefinitions(detachedRoleDefinitions(config.RoleConfig)) + if err != nil { + 
logrus.WithError(err).Errorln("Failed to normalize merged role configuration") + return nil, err + } + normalized.roleDefinitionsJSON = defsJSON + normalized.roleDefinitionsChanged = !bytes.Equal(snapshot.roleDefinitionsJSON, defsJSON) + if normalized.roleDefinitionsChanged { + normalized.roleDefinitions = defs + } + } + } + + if config.WorkflowConfig != nil { + mergedWorkflowDefinitionsJSON, err := marshalJSON(detachedWorkflowDefinitions(config.WorkflowConfig)) + if err != nil { + return nil, err + } + if !bytes.Equal(snapshot.workflowDefinitionsJSON, mergedWorkflowDefinitionsJSON) { + defs, defsJSON, err := c.normalizeWorkflowDefinitions(detachedWorkflowDefinitions(config.WorkflowConfig)) + if err != nil { + logrus.WithError(err).Errorln("Failed to normalize merged workflow configuration") + return nil, err + } + normalized.workflowDefinitionsJSON = defsJSON + normalized.workflowDefinitionsChanged = !bytes.Equal(snapshot.workflowDefinitionsJSON, defsJSON) + if normalized.workflowDefinitionsChanged { + normalized.workflowDefinitions = defs + } + } + } + + if config.ProviderConfig != nil { + mergedProviderDefinitionsJSON, err := marshalJSON(detachedProviderDefinitions(config.ProviderConfig)) + if err != nil { + return nil, err + } + if !bytes.Equal(snapshot.providerDefinitionsJSON, mergedProviderDefinitionsJSON) { + defs, defsJSON, err := c.normalizeProviderDefinitions(detachedProviderDefinitions(config.ProviderConfig)) + if err != nil { + logrus.WithError(err).Errorln("Failed to normalize merged provider configuration") + return nil, err + } + normalized.providerDefinitionsJSON = defsJSON + normalized.providerDefinitionsChanged = !bytes.Equal(snapshot.providerDefinitionsJSON, defsJSON) + if normalized.providerDefinitionsChanged { + normalized.providerDefinitions = defs + } + } + } + + return normalized, nil +} + +func (c *Config) commitMergedDefinitions(diff *normalizedDefinitionsPatch, expectedGeneration uint64) bool { + c.mu.Lock() + defer c.mu.Unlock() + + if 
c.configGeneration != expectedGeneration { + return false + } + + changed := false + if diff.roleDefinitionsChanged { + c.Roles.Definitions = diff.roleDefinitions + changed = true + } + if diff.workflowDefinitionsChanged { + c.Workflows.Definitions = diff.workflowDefinitions + changed = true + } + if diff.providerDefinitionsChanged { + c.Providers.Definitions = diff.providerDefinitions + changed = true + } + if changed { + c.configGeneration++ + } + + return true +} + +func marshalJSON(value any) ([]byte, error) { + return json.Marshal(value) +} + +func mergeConfigSection(current any, incoming any, out any) error { + currentData, err := json.Marshal(current) + if err != nil { + return err + } + + incomingData, err := json.Marshal(incoming) + if err != nil { + return err + } + + mergedData, err := jsonpatch.MergePatch(currentData, incomingData) + if err != nil { + return err + } + + return json.Unmarshal(mergedData, out) +} + func (c *Config) updateRoles(roleConfig *RoleConfig) error { - _, err := c.ApplyRoles([]*models.RoleDefinitions{{ - Roles: roleConfig.Definitions, - }}) - return err + c.mu.RLock() + current := RoleConfig{ + Path: c.Roles.Path, + URL: c.Roles.URL, + Vault: c.Roles.Vault, + Definitions: c.Roles.Definitions, + } + c.mu.RUnlock() + + var merged RoleConfig + if err := mergeConfigSection(current, *roleConfig, &merged); err != nil { + return err + } + + return c.storeRoleDefinitions(merged.Definitions) } func (c *Config) updateWorkflows(workflowConfig *WorkflowConfig) error { - _, err := c.ApplyWorkflows([]*models.WorkflowDefinitions{{ - Workflows: workflowConfig.Definitions, - }}) - return err + c.mu.RLock() + current := WorkflowConfig{ + Path: c.Workflows.Path, + URL: c.Workflows.URL, + Vault: c.Workflows.Vault, + Plugins: c.Workflows.Plugins, + Definitions: c.Workflows.Definitions, + } + c.mu.RUnlock() + + var merged WorkflowConfig + if err := mergeConfigSection(current, *workflowConfig, &merged); err != nil { + return err + } + + return 
c.storeWorkflowDefinitions(merged.Definitions) } func (c *Config) updateProviders(providerConfig *ProviderDefinitionsConfig) error { - _, err := c.ApplyProviders([]*models.ProviderDefinitions{{ - Providers: providerConfig.Definitions, + c.mu.RLock() + current := ProviderDefinitionsConfig{ + Path: c.Providers.Path, + URL: c.Providers.URL, + Vault: c.Providers.Vault, + Plugins: c.Providers.Plugins, + Definitions: c.Providers.Definitions, + } + c.mu.RUnlock() + + var merged ProviderDefinitionsConfig + if err := mergeConfigSection(current, *providerConfig, &merged); err != nil { + return err + } + + return c.storeProviderDefinitions(merged.Definitions) +} + +func (c *Config) storeRoleDefinitions(definitions map[string]models.Role) error { + defs, defsJSON, err := c.normalizeRoleDefinitions(definitions) + if err != nil { + return err + } + + return c.commitRoleDefinitions(defs, defsJSON) +} + +func (c *Config) storeWorkflowDefinitions(definitions map[string]models.Workflow) error { + defs, defsJSON, err := c.normalizeWorkflowDefinitions(definitions) + if err != nil { + return err + } + + return c.commitWorkflowDefinitions(defs, defsJSON) +} + +func (c *Config) storeProviderDefinitions(definitions map[string]models.ProviderConfig) error { + defs, defsJSON, err := c.normalizeProviderDefinitions(definitions) + if err != nil { + return err + } + + return c.commitProviderDefinitions(defs, defsJSON) +} + +func (c *Config) normalizeRoleDefinitions(definitions map[string]models.Role) (map[string]models.Role, []byte, error) { + defs, err := (&Config{}).ApplyRoles([]*models.RoleDefinitions{{ + Roles: definitions, + }}) + if err != nil { + return nil, nil, err + } + + defsJSON, err := marshalJSON(defs) + if err != nil { + return nil, nil, err + } + + return defs, defsJSON, nil +} + +func (c *Config) normalizeWorkflowDefinitions(definitions map[string]models.Workflow) (map[string]models.Workflow, []byte, error) { + defs, err := (&Config{mode: 
c.mode}).ApplyWorkflows([]*models.WorkflowDefinitions{{ + Workflows: definitions, + }}) + if err != nil { + return nil, nil, err + } + + defsJSON, err := marshalJSON(defs) + if err != nil { + return nil, nil, err + } + + return defs, defsJSON, nil +} + +func (c *Config) normalizeProviderDefinitions(definitions map[string]models.ProviderConfig) (map[string]models.ProviderConfig, []byte, error) { + defs, err := (&Config{}).ApplyProviders([]*models.ProviderDefinitions{{ + Providers: definitions, }}) - return err + if err != nil { + return nil, nil, err + } + + defsJSON, err := marshalJSON(defs) + if err != nil { + return nil, nil, err + } + + return defs, defsJSON, nil +} + +func (c *Config) commitRoleDefinitions(defs map[string]models.Role, defsJSON []byte) error { + c.mu.Lock() + defer c.mu.Unlock() + + currentJSON, err := marshalJSON(c.Roles.Definitions) + if err != nil { + return err + } + if bytes.Equal(currentJSON, defsJSON) { + return nil + } + + c.Roles.Definitions = defs + c.configGeneration++ + return nil +} + +func (c *Config) commitWorkflowDefinitions(defs map[string]models.Workflow, defsJSON []byte) error { + c.mu.Lock() + defer c.mu.Unlock() + + currentJSON, err := marshalJSON(c.Workflows.Definitions) + if err != nil { + return err + } + if bytes.Equal(currentJSON, defsJSON) { + return nil + } + + c.Workflows.Definitions = defs + c.configGeneration++ + return nil +} + +func (c *Config) commitProviderDefinitions(defs map[string]models.ProviderConfig, defsJSON []byte) error { + c.mu.Lock() + defer c.mu.Unlock() + + currentJSON, err := marshalJSON(c.Providers.Definitions) + if err != nil { + return err + } + if bytes.Equal(currentJSON, defsJSON) { + return nil + } + + c.Providers.Definitions = defs + c.configGeneration++ + return nil +} + +func detachedRoleDefinitions(config *RoleConfig) map[string]models.Role { + if config == nil { + return nil + } + return config.Definitions +} + +func detachedWorkflowDefinitions(config *WorkflowConfig) 
map[string]models.Workflow { + if config == nil { + return nil + } + return config.Definitions +} + +func detachedProviderDefinitions(config *ProviderDefinitionsConfig) map[string]models.ProviderConfig { + if config == nil { + return nil + } + return config.Definitions } diff --git a/internal/config/sync_test.go b/internal/config/sync_test.go index b40e88e1..5d7b9a98 100644 --- a/internal/config/sync_test.go +++ b/internal/config/sync_test.go @@ -10,6 +10,8 @@ import ( "time" jsonpatch "github.com/evanphx/json-patch" + "github.com/hashicorp/go-version" + "github.com/serverlessworkflow/sdk-go/v3/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/thand-io/agent/internal/models" @@ -85,6 +87,7 @@ func newSyncTestConfig( Thand: models.ThandConfig{ Endpoint: endpoint, ApiKey: "test-api-key", + Sync: true, }, } @@ -116,6 +119,46 @@ func makeRegistrationResponse( return resp } +func makeTestWorkflow(name, description string) models.Workflow { + return models.Workflow{ + Name: name, + Description: description, + Enabled: true, + Workflow: &model.Workflow{ + Do: &model.TaskList{}, + }, + } +} + +func makeNormalizedRole(name, description string) models.Role { + return models.Role{ + Version: version.Must(version.NewVersion("1.0")), + Identifier: name, + Name: name, + Description: description, + Enabled: true, + } +} + +func makeTestProvider(name, description string) models.ProviderConfig { + return models.ProviderConfig{ + Name: name, + Description: description, + Provider: "mock", + Enabled: true, + } +} + +func makeTestProviderWithConfig(name, description string, config map[string]any) models.ProviderConfig { + provider := makeTestProvider(name, description) + basicConfig := models.BasicConfig{} + for key, value := range config { + basicConfig[key] = value + } + provider.Config = &basicConfig + return provider +} + // waitForPatch waits for a single PATCH call on the channel, or times out. 
func waitForPatch(ch <-chan syncPatchCall, timeout time.Duration) (syncPatchCall, bool) { select { @@ -126,6 +169,16 @@ func waitForPatch(ch <-chan syncPatchCall, timeout time.Duration) (syncPatchCall } } +func assertNoPatch(t *testing.T, ch <-chan syncPatchCall, timeout time.Duration) { + t.Helper() + + select { + case call := <-ch: + t.Fatalf("expected no outgoing PATCH call, got %s %s", call.Method, call.URL) + case <-time.After(timeout): + } +} + // --------------------------------------------------------------------------- // Tests for MergeConfiguration — incoming config merging // --------------------------------------------------------------------------- @@ -147,6 +200,11 @@ func TestMergeConfiguration_ServerSendsNewRoles(t *testing.T) { err := config.MergeConfiguration(reg) require.NoError(t, err) + role, exists := config.Roles.Definitions["admin"] + require.True(t, exists, "expected synced role to be stored locally") + assert.Equal(t, "admin", role.Name) + assert.True(t, role.Enabled) + _, ok := waitForPatch(patchCh, 5*time.Second) require.True(t, ok, "expected outgoing PATCH call") } @@ -157,7 +215,8 @@ func TestMergeConfiguration_ServerSendsUpdatedRole(t *testing.T) { // Local config has an existing role config := newSyncTestConfig(t, map[string]models.Role{ - "editor": {Name: "editor", Description: "Can edit", Enabled: true}, + "editor": {Name: "editor", Description: "Can edit", Enabled: true}, + "untouched": {Name: "untouched", Description: "Keep me", Enabled: true}, }, nil, nil, server.URL, ) @@ -173,6 +232,128 @@ func TestMergeConfiguration_ServerSendsUpdatedRole(t *testing.T) { err := config.MergeConfiguration(reg) require.NoError(t, err) + role, exists := config.Roles.Definitions["editor"] + require.True(t, exists, "expected synced role to be stored locally") + assert.Equal(t, "Can edit and publish", role.Description) + assert.True(t, role.Enabled) + assert.Contains(t, config.Roles.Definitions, "untouched") + + _, ok := waitForPatch(patchCh, 
5*time.Second) + require.True(t, ok, "expected outgoing PATCH call") +} + +func TestMergeConfiguration_ServerSendsNewWorkflows(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, nil, nil, nil, server.URL) + + reg := makeRegistrationResponse( + nil, + map[string]models.Workflow{ + "approval": makeTestWorkflow("approval", "Handles approvals"), + }, + nil, + ) + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + + workflow, exists := config.Workflows.Definitions["approval"] + require.True(t, exists, "expected synced workflow to be stored locally") + assert.Equal(t, "Handles approvals", workflow.Description) + require.NotNil(t, workflow.Workflow) + + _, ok := waitForPatch(patchCh, 5*time.Second) + require.True(t, ok, "expected outgoing PATCH call") +} + +func TestMergeConfiguration_ServerSendsUpdatedWorkflow(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, + nil, + map[string]models.Workflow{ + "existing": makeTestWorkflow("existing", "Existing workflow"), + "unchanged": makeTestWorkflow("unchanged", "Keep me"), + }, + nil, + server.URL, + ) + + reg := makeRegistrationResponse( + nil, + map[string]models.Workflow{ + "existing": makeTestWorkflow("existing", "Updated workflow"), + }, + nil, + ) + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + + workflow, exists := config.Workflows.Definitions["existing"] + require.True(t, exists, "expected synced workflow to be stored locally") + assert.Equal(t, "Updated workflow", workflow.Description) + assert.Contains(t, config.Workflows.Definitions, "unchanged") + + _, ok := waitForPatch(patchCh, 5*time.Second) + require.True(t, ok, "expected outgoing PATCH call") +} + +func TestMergeConfiguration_ServerSendsNewProviders(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, nil, nil, nil, server.URL) + + reg := makeRegistrationResponse( + nil, + nil, + 
map[string]models.ProviderConfig{ + "mock-primary": makeTestProvider("mock-primary", "Primary mock provider"), + }, + ) + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + + provider, exists := config.Providers.Definitions["mock-primary"] + require.True(t, exists, "expected synced provider to be stored locally") + assert.Equal(t, "Primary mock provider", provider.Description) + assert.Equal(t, "mock", provider.Provider) + + _, ok := waitForPatch(patchCh, 5*time.Second) + require.True(t, ok, "expected outgoing PATCH call") +} + +func TestMergeConfiguration_ServerSendsUpdatedProvider(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, + nil, + nil, + map[string]models.ProviderConfig{ + "mock-primary": makeTestProvider("mock-primary", "Old provider description"), + "mock-extra": makeTestProvider("mock-extra", "Keep me"), + }, + server.URL, + ) + + reg := makeRegistrationResponse( + nil, + nil, + map[string]models.ProviderConfig{ + "mock-primary": makeTestProvider("mock-primary", "Updated provider description"), + }, + ) + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + + provider, exists := config.Providers.Definitions["mock-primary"] + require.True(t, exists, "expected synced provider to be stored locally") + assert.Equal(t, "Updated provider description", provider.Description) + assert.Contains(t, config.Providers.Definitions, "mock-extra") + _, ok := waitForPatch(patchCh, 5*time.Second) require.True(t, ok, "expected outgoing PATCH call") } @@ -181,7 +362,7 @@ func TestMergeConfiguration_IdenticalConfigs(t *testing.T) { server, patchCh := newSyncTestServer(t) roles := map[string]models.Role{ - "viewer": {Name: "viewer", Description: "Read-only", Enabled: true}, + "viewer": makeNormalizedRole("viewer", "Read-only"), } config := newSyncTestConfig(t, roles, nil, nil, server.URL) @@ -191,12 +372,41 @@ func TestMergeConfiguration_IdenticalConfigs(t *testing.T) { err := 
config.MergeConfiguration(reg) require.NoError(t, err) + assert.Equal(t, uint64(0), config.configGeneration, "identical sync should not advance the generation") // The outgoing goroutine still fires (with an empty or no-op patch) _, ok := waitForPatch(patchCh, 5*time.Second) require.True(t, ok, "expected outgoing PATCH call") } +func TestMergeConfiguration_MetadataOnlyRoleChangesAreIgnored(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, + map[string]models.Role{ + "existing": makeNormalizedRole("existing", "same"), + }, + nil, nil, server.URL, + ) + config.Roles.Path = "./local-roles" + + reg := makeRegistrationResponse( + map[string]models.Role{ + "existing": makeNormalizedRole("existing", "same"), + }, + nil, nil, + ) + reg.Roles.Path = "./remote-roles" + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + assert.Equal(t, uint64(0), config.configGeneration, "metadata-only sync changes should not advance generation") + assert.Equal(t, "./local-roles", config.Roles.Path, "definitions-only sync should not rewrite role metadata") + + _, ok := waitForPatch(patchCh, 5*time.Second) + require.True(t, ok, "expected outgoing PATCH call") +} + func TestMergeConfiguration_PartialConfig_OnlyRoles(t *testing.T) { server, patchCh := newSyncTestServer(t) @@ -218,6 +428,11 @@ func TestMergeConfiguration_PartialConfig_OnlyRoles(t *testing.T) { err := config.MergeConfiguration(reg) require.NoError(t, err) + role, exists := config.Roles.Definitions["new-role"] + require.True(t, exists, "expected synced role to be stored locally") + assert.Equal(t, "new-role", role.Name) + assert.Contains(t, config.Roles.Definitions, "existing") + _, ok := waitForPatch(patchCh, 5*time.Second) require.True(t, ok, "expected outgoing PATCH call") } @@ -436,6 +651,34 @@ func TestMergeConfiguration_OutgoingPatch_URLContainsSync(t *testing.T) { assert.Contains(t, call.URL, "/sync") } +func 
TestMergeConfiguration_NoOutgoingPatchWithoutThandService(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, nil, nil, nil, server.URL) + config.Thand.ApiKey = "" + + reg := makeRegistrationResponse(nil, nil, nil) + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + + assertNoPatch(t, patchCh, 300*time.Millisecond) +} + +func TestMergeConfiguration_NoOutgoingPatchWhenThandSyncDisabled(t *testing.T) { + server, patchCh := newSyncTestServer(t) + + config := newSyncTestConfig(t, nil, nil, nil, server.URL) + config.Thand.Sync = false + + reg := makeRegistrationResponse(nil, nil, nil) + + err := config.MergeConfiguration(reg) + require.NoError(t, err) + + assertNoPatch(t, patchCh, 300*time.Millisecond) +} + // --------------------------------------------------------------------------- // Tests for applyPatch // --------------------------------------------------------------------------- @@ -461,6 +704,29 @@ func TestApplyPatch_AppliesRoles(t *testing.T) { err := config.applyPatch(diff) assert.NoError(t, err) + assert.Contains(t, config.Roles.Definitions, "new-role") + assert.Equal(t, uint64(1), config.configGeneration) +} + +func TestApplyPatch_IdenticalRolesDoNotAdvanceGeneration(t *testing.T) { + config := newSyncTestConfig(t, + map[string]models.Role{ + "existing": makeNormalizedRole("existing", "same"), + }, + nil, nil, "", + ) + + diff := ConfigPatchRequest{ + RoleConfig: &RoleConfig{ + Definitions: map[string]models.Role{ + "existing": makeNormalizedRole("existing", "same"), + }, + }, + } + + err := config.applyPatch(diff) + require.NoError(t, err) + assert.Equal(t, uint64(0), config.configGeneration, "no-op apply should not advance generation") } func TestApplyPatch_SkipsNilWorkflows(t *testing.T) { @@ -484,6 +750,208 @@ func TestApplyPatch_SkipsNilWorkflows(t *testing.T) { assert.NoError(t, err) } +func TestApplyMergedConfigWithSnapshot_RejectsStaleGeneration(t *testing.T) { + config := 
newSyncTestConfig(t, + map[string]models.Role{ + "existing": makeNormalizedRole("existing", "before"), + }, + nil, nil, "", + ) + + snapshot, err := config.snapshotConfigPatch() + require.NoError(t, err) + + config.mu.Lock() + config.configGeneration++ + config.mu.Unlock() + + applied, err := config.applyMergedConfigWithSnapshot(snapshot, ConfigPatchRequest{ + RoleConfig: &RoleConfig{ + Definitions: map[string]models.Role{ + "existing": makeNormalizedRole("existing", "after"), + }, + }, + }) + require.NoError(t, err) + assert.False(t, applied, "expected generation mismatch to reject the stale merged apply") + assert.Equal(t, "before", config.Roles.Definitions["existing"].Description) +} + +func TestApplyMergedConfigWithRetries_RetriesAndSucceeds(t *testing.T) { + config := newSyncTestConfig(t, + map[string]models.Role{ + "existing": makeNormalizedRole("existing", "before"), + }, + nil, nil, "", + ) + + attempts := 0 + outgoingPatch, err := config.applyMergedConfigWithRetries(func(snapshot *configPatchSnapshot) (*buildMergedConfigResult, error) { + attempts++ + + if attempts == 1 { + config.mu.Lock() + config.configGeneration++ + config.mu.Unlock() + } + + return &buildMergedConfigResult{ + config: ConfigPatchRequest{ + RoleConfig: &RoleConfig{ + Definitions: map[string]models.Role{ + "existing": makeNormalizedRole("existing", "after"), + }, + }, + }, + outgoingPatch: []byte(`{"roles":{}}`), + }, nil + }) + require.NoError(t, err) + assert.Equal(t, 2, attempts, "expected one retry before success") + assert.JSONEq(t, `{"roles":{}}`, string(outgoingPatch)) + assert.Equal(t, "after", config.Roles.Definitions["existing"].Description) + assert.Equal(t, uint64(2), config.configGeneration) +} + +func TestSnapshotConfigPatch_DetachesRoleSlices(t *testing.T) { + config := newSyncTestConfig(t, + map[string]models.Role{ + "editor": { + Name: "editor", + Providers: []string{"aws-prod"}, + Permissions: models.RolePermissions{ + Allow: models.RoleStatements{{ + Operations: 
[]string{"s3:GetObject"}, + }}, + }, + Enabled: true, + }, + }, + nil, nil, "", + ) + + snapshot, err := config.snapshotConfigPatch() + require.NoError(t, err) + + role := config.Roles.Definitions["editor"] + role.Providers[0] = "gcp-prod" + role.Permissions.Allow[0].Operations[0] = "storage.objects.get" + + snapRole, exists := snapshot.request.RoleConfig.Definitions["editor"] + require.True(t, exists) + assert.Equal(t, "aws-prod", snapRole.Providers[0]) + assert.Equal(t, "s3:GetObject", snapRole.Permissions.Allow[0].Operations[0]) +} + +func TestSnapshotConfigPatch_DetachesWorkflowDefinitions(t *testing.T) { + config := newSyncTestConfig(t, + nil, + map[string]models.Workflow{ + "approval": makeTestWorkflow("approval", "original"), + }, + nil, "", + ) + + snapshot, err := config.snapshotConfigPatch() + require.NoError(t, err) + + workflow := config.Workflows.Definitions["approval"] + workflow.Workflow.Do = nil + + snapWorkflow, exists := snapshot.request.WorkflowConfig.Definitions["approval"] + require.True(t, exists) + require.NotNil(t, snapWorkflow.Workflow) + assert.NotNil(t, snapWorkflow.Workflow.Do) +} + +func TestSnapshotConfigPatch_DetachesProviderConfig(t *testing.T) { + config := &Config{ + Providers: ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + "mock-primary": makeTestProviderWithConfig("mock-primary", "primary", map[string]any{ + "region": "us-east-1", + }), + }, + }, + } + + snapshot, err := config.snapshotConfigPatch() + require.NoError(t, err) + + provider := config.Providers.Definitions["mock-primary"] + require.NotNil(t, provider.Config) + provider.Config.SetKeyWithValue("region", "eu-west-1") + + snapProvider, exists := snapshot.request.ProviderConfig.Definitions["mock-primary"] + require.True(t, exists) + require.NotNil(t, snapProvider.Config) + region, ok := snapProvider.Config.GetString("region") + require.True(t, ok) + assert.Equal(t, "us-east-1", region) +} + +func 
TestKnownGap_NestedProviderMutationsShouldAdvanceConfigGeneration(t *testing.T) { + t.Skip("expected failure until #306: nested provider config mutations bypass configGeneration") + + config := &Config{ + Providers: ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + "mock-primary": makeTestProviderWithConfig("mock-primary", "primary", map[string]any{ + "region": "us-east-1", + }), + }, + }, + } + + provider := config.Providers.Definitions["mock-primary"] + require.NotNil(t, provider.Config) + provider.Config.SetKeyWithValue("region", "eu-west-1") + + assert.Equal(t, uint64(1), config.configGeneration, "nested provider mutations should participate in generation tracking") +} + +func TestKnownGap_NestedRoleMutationsShouldAdvanceConfigGeneration(t *testing.T) { + t.Skip("expected failure until #306: nested role mutations bypass configGeneration") + + config := newSyncTestConfig(t, + map[string]models.Role{ + "editor": { + Name: "editor", + Providers: []string{"aws-prod"}, + Permissions: models.RolePermissions{ + Allow: models.RoleStatements{{ + Operations: []string{"s3:GetObject"}, + }}, + }, + Enabled: true, + }, + }, + nil, nil, "", + ) + + role := config.Roles.Definitions["editor"] + role.Permissions.Allow[0].Operations[0] = "storage.objects.get" + + assert.Equal(t, uint64(1), config.configGeneration, "nested role mutations should participate in generation tracking") +} + +func TestKnownGap_NestedWorkflowMutationsShouldAdvanceConfigGeneration(t *testing.T) { + t.Skip("expected failure until #306: nested workflow mutations bypass configGeneration") + + config := newSyncTestConfig(t, + nil, + map[string]models.Workflow{ + "approval": makeTestWorkflow("approval", "original"), + }, + nil, "", + ) + + workflow := config.Workflows.Definitions["approval"] + workflow.Workflow.Do = nil + + assert.Equal(t, uint64(1), config.configGeneration, "nested workflow mutations should participate in generation tracking") +} + // 
--------------------------------------------------------------------------- // Tests for the merge-patch logic (JSON diffing correctness) // --------------------------------------------------------------------------- diff --git a/internal/config/temporal.go b/internal/config/temporal.go index 5f4d4777..edac0bef 100644 --- a/internal/config/temporal.go +++ b/internal/config/temporal.go @@ -6,6 +6,7 @@ import ( "github.com/sirupsen/logrus" "github.com/thand-io/agent/internal/models" "go.temporal.io/sdk/activity" + "go.temporal.io/sdk/workflow" ) // Register temporal workflows and activities @@ -14,12 +15,32 @@ func (c *Config) registerTemporalWorkflows() error { return fmt.Errorf("temporal service is not initialized") } - temporalWorker := c.servicesClient.GetTemporal().GetWorker() + if !c.IsServer() { + return nil + } - if temporalWorker == nil { - return fmt.Errorf("temporal worker is not initialized") + // Registry singletons live on the shared device-registry queue rather than + // the per-server operational queue. 
+ registryWorker := c.getDeviceRegistryWorker() + if registryWorker == nil { + return fmt.Errorf("device registry worker is not initialized") } + registryWorker.RegisterWorkflowWithOptions( + deviceRouteRegistryWorkflow, + workflow.RegisterOptions{ + Name: models.TemporalDeviceRouteRegistryWorkflowName, + VersioningBehavior: workflow.VersioningBehaviorAutoUpgrade, + }, + ) + registryWorker.RegisterWorkflowWithOptions( + deviceDefinitionRegistryWorkflow, + workflow.RegisterOptions{ + Name: models.TemporalDeviceDefinitionRegistryWorkflowName, + VersioningBehavior: workflow.VersioningBehaviorAutoUpgrade, + }, + ) + return nil } @@ -29,7 +50,7 @@ func (c *Config) registerTemporalActivities() error { return fmt.Errorf("temporal service is not initialized") } - temporalWorker := c.servicesClient.GetTemporal().GetWorker() + temporalWorker := c.getOperationalTemporalWorker() if temporalWorker == nil { return fmt.Errorf("temporal worker is not initialized") @@ -62,6 +83,19 @@ func (c *Config) registerTemporalActivities() error { ) } + temporalWorker.RegisterActivityWithOptions( + thandActivities.ResolveFreshDeviceRoute, + activity.RegisterOptions{ + Name: models.TemporalResolveFreshDeviceRouteActivityName, + }, + ) + temporalWorker.RegisterActivityWithOptions( + thandActivities.BuildExecutionPlan, + activity.RegisterOptions{ + Name: models.TemporalBuildExecutionPlanActivityName, + }, + ) + return nil } diff --git a/internal/config/temporal_activities.go b/internal/config/temporal_activities.go index 43df3169..c755ab32 100644 --- a/internal/config/temporal_activities.go +++ b/internal/config/temporal_activities.go @@ -2,18 +2,22 @@ package config import ( "context" + "errors" "fmt" "strings" "github.com/serverlessworkflow/sdk-go/v3/model" "github.com/sirupsen/logrus" "github.com/thand-io/agent/internal/models" + "go.temporal.io/api/enums/v1" "go.temporal.io/sdk/activity" + "go.temporal.io/sdk/client" "go.temporal.io/sdk/temporal" ) type thandActivities struct { - config 
*Config + config *Config + lookupDeviceDefinition func(ctx context.Context, deviceID string) (*models.Device, error) } // PatchProviderUpstreamDummy is a no-op activity for thand server/agents that are not @@ -85,3 +89,94 @@ func (t *thandActivities) PatchProviderUpstream( return err } + +func (t *thandActivities) ResolveFreshDeviceRoute( + ctx context.Context, + deviceID string, +) (*models.DeviceConnectionState, error) { + route, err := t.queryFreshDeviceRoute(ctx, deviceID) + if err == nil { + return route, nil + } + if errors.Is(err, ErrDeviceRouteUnavailable) { + return nil, temporal.NewNonRetryableApplicationError( + err.Error(), + "DeviceRouteUnavailable", + err, + ) + } + return nil, err +} + +func (t *thandActivities) BuildExecutionPlan( + ctx context.Context, + req models.ExecutionPlanRequest, +) (*models.ExecutionPlan, error) { + if req.ElevateRequest == nil { + return nil, temporal.NewNonRetryableApplicationError( + "elevate request is required for execution planning", + "ExecutionPlanInvalid", + nil, + ) + } + + plan, err := BuildExecutionPlanWithOptions(t.config, req.WorkflowID, req.ElevateRequest, executionPlanBuildOptions{ + LookupDeviceDefinition: func(deviceID string) (*models.Device, error) { + if t.lookupDeviceDefinition != nil { + return t.lookupDeviceDefinition(ctx, deviceID) + } + return t.config.querySharedDeviceDefinition(ctx, deviceID) + }, + }) + if err == nil { + return plan, nil + } + + return nil, temporal.NewNonRetryableApplicationError( + err.Error(), + "ExecutionPlanInvalid", + err, + ) +} + +func (t *thandActivities) queryFreshDeviceRoute( + ctx context.Context, + deviceID string, +) (*models.DeviceConnectionState, error) { + services := t.config.GetServices() + if services == nil || !services.HasTemporal() { + return t.config.GetFreshDeviceRoute(deviceID) + } + + temporalService := services.GetTemporal() + if temporalService == nil || !temporalService.HasClient() { + return t.config.GetFreshDeviceRoute(deviceID) + } + + 
timeoutCtx, cancel := context.WithTimeout(ctx, deviceRouteRefreshInterval) + defer cancel() + + queryResponse, err := temporalService.GetClient().QueryWorkflowWithOptions(timeoutCtx, &client.QueryWorkflowWithOptionsRequest{ + WorkflowID: models.TemporalDeviceRouteRegistryWorkflowID, + RunID: "", + QueryType: models.TemporalGetDeviceRouteQueryName, + QueryRejectCondition: enums.QUERY_REJECT_CONDITION_NOT_OPEN, + Args: []any{deviceID}, + }) + if err != nil { + return nil, fmt.Errorf("%w: device %q is not connected", ErrDeviceRouteUnavailable, strings.TrimSpace(deviceID)) + } + if queryResponse == nil || queryResponse.QueryResult == nil { + return nil, fmt.Errorf("%w: device %q is not connected", ErrDeviceRouteUnavailable, strings.TrimSpace(deviceID)) + } + + var route models.DeviceConnectionState + if err := queryResponse.QueryResult.Get(&route); err != nil { + return nil, err + } + if !route.Connected || strings.TrimSpace(route.TaskQueue) == "" { + return nil, fmt.Errorf("%w: device %q is not connected", ErrDeviceRouteUnavailable, strings.TrimSpace(deviceID)) + } + + return &route, nil +} diff --git a/internal/config/temporal_workers.go b/internal/config/temporal_workers.go new file mode 100644 index 00000000..0d8638bb --- /dev/null +++ b/internal/config/temporal_workers.go @@ -0,0 +1,103 @@ +package config + +import ( + "github.com/thand-io/agent/internal/models" + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/worker" +) + +// temporalWorkerScope narrows a Temporal service view to a specific worker set. +// In server mode we run both the operational worker queue and the shared device +// registry queue, and this wrapper keeps ordinary workflow/activity/provider +// registrations from accidentally landing on the registry worker. 
+type temporalWorkerScope struct { + base models.TemporalImpl + workerIDs []string +} + +func (t *temporalWorkerScope) Initialize() error { + return t.base.Initialize() +} + +func (t *temporalWorkerScope) Shutdown() error { + return t.base.Shutdown() +} + +func (t *temporalWorkerScope) StartWorkers() error { + return t.base.StartWorkers() +} + +func (t *temporalWorkerScope) GetClient() client.Client { + return t.base.GetClient() +} + +func (t *temporalWorkerScope) HasClient() bool { + return t.base.HasClient() +} + +func (t *temporalWorkerScope) GetWorker(identities ...string) worker.Worker { + if len(identities) == 0 { + identities = t.workerIDs + } + return t.base.GetWorker(identities...) +} + +func (t *temporalWorkerScope) HasWorker() bool { + return t.base.HasWorker() +} + +func (t *temporalWorkerScope) GetHostPort() string { + return t.base.GetHostPort() +} + +func (t *temporalWorkerScope) GetNamespace() string { + return t.base.GetNamespace() +} + +func (t *temporalWorkerScope) GetTaskQueue() string { + return t.base.GetTaskQueue() +} + +func (t *temporalWorkerScope) IsVersioningDisabled() bool { + return t.base.IsVersioningDisabled() +} + +// getOperationalTemporalWorker returns the worker that should own normal server +// workflows and activities. Device-registry singletons are registered on a +// separate shared queue. +func (c *Config) getOperationalTemporalWorker() worker.Worker { + temporalService := c.servicesClient.GetTemporal() + if temporalService == nil { + return nil + } + if c.IsServer() { + return temporalService.GetWorker(temporalService.GetTaskQueue()) + } + return temporalService.GetWorker() +} + +// getOperationalTemporalService scopes provider workflow/activity registration +// away from the shared device-registry queue in server mode. 
+func (c *Config) getOperationalTemporalService() models.TemporalImpl { + temporalService := c.servicesClient.GetTemporal() + if temporalService == nil { + return nil + } + if c.IsServer() { + return &temporalWorkerScope{ + base: temporalService, + workerIDs: []string{temporalService.GetTaskQueue()}, + } + } + return temporalService +} + +// getDeviceRegistryWorker returns the shared worker that owns device registry +// singleton workflows across servers. +func (c *Config) getDeviceRegistryWorker() worker.Worker { + temporalService := c.servicesClient.GetTemporal() + if temporalService == nil { + return nil + } + return temporalService.GetWorker(models.TemporalDeviceRegistryTaskQueue) +} diff --git a/internal/daemon/elevate.go b/internal/daemon/elevate.go index 3714a7b9..36fdb992 100644 --- a/internal/daemon/elevate.go +++ b/internal/daemon/elevate.go @@ -20,7 +20,7 @@ import ( "github.com/thand-io/agent/internal/workflows/manager" ) -// getElevate handles GET /api/v1/elevate?role=admin&target=server&reason=maintenance +// getElevate handles GET /api/v1/elevate?role=admin&device=&reason=maintenance // // @Summary Request role elevation // @Description Request elevation to a specific role with static parameters @@ -29,6 +29,7 @@ import ( // @Produce json // @Param role query string true "Role name" // @Param provider query string true "Provider name" +// @Param device query string false "Canonical device_id for device-local workflows" // @Param reason query string true "Reason for elevation" // @Param duration query string false "Duration of elevation" // @Param workflow query string false "Workflow name" @@ -72,6 +73,7 @@ func (s *Server) getElevate(c *gin.Context) { Providers: []string{request.Provider}, Identities: request.Identities, Workflow: primaryWorkflow, + Device: request.Device, Reason: request.Reason, Duration: request.Duration, Session: request.Session, @@ -671,6 +673,7 @@ type ElevateStaticPageData struct { Identities []models.Identity 
`json:"identities"` Providers []string `json:"providers"` Roles []string `json:"roles"` + Device string `json:"device"` Duration string `json:"duration"` Reason string `json:"reason"` Tenants []string `json:"tenants"` @@ -682,6 +685,9 @@ func (s *Server) getElevationPagePrefill(c *gin.Context) ElevateStaticPageData { } preFilledTenants := c.QueryArray("tenants") + if len(preFilledTenants) == 0 { + preFilledTenants = c.QueryArray("tenant") + } validTenants := []string{} for _, tenantID := range preFilledTenants { tenant, err := s.Config.GetTenant(tenantID) @@ -729,6 +735,8 @@ func (s *Server) getElevationPagePrefill(c *gin.Context) ElevateStaticPageData { data.Roles = roles } + data.Device = strings.TrimSpace(c.Query("device")) + // Get duration from query parameters duration := c.Query("duration") if len(duration) > 0 { diff --git a/internal/daemon/elevate_prefill_test.go b/internal/daemon/elevate_prefill_test.go new file mode 100644 index 00000000..bcfa6988 --- /dev/null +++ b/internal/daemon/elevate_prefill_test.go @@ -0,0 +1,31 @@ +package daemon + +import ( + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/assert" + "github.com/thand-io/agent/internal/config" +) + +func TestGetElevationPagePrefillIncludesDevice(t *testing.T) { + gin.SetMode(gin.TestMode) + + recorder := httptest.NewRecorder() + ctx, _ := gin.CreateTestContext(recorder) + ctx.Request = httptest.NewRequest( + "GET", + "/elevate/static?provider=local-elevation&role=local_sudo&device=device-alpha&duration=1h&reason=test", + nil, + ) + + server := NewServer(config.DefaultConfig()) + data := server.getElevationPagePrefill(ctx) + + assert.Equal(t, []string{"local-elevation"}, data.Providers) + assert.Equal(t, []string{"local_sudo"}, data.Roles) + assert.Equal(t, "device-alpha", data.Device) + assert.Equal(t, "1h", data.Duration) + assert.Equal(t, "test", data.Reason) +} diff --git a/internal/daemon/register.go b/internal/daemon/register.go index 
5a5d64a0..aa501bef 100644 --- a/internal/daemon/register.go +++ b/internal/daemon/register.go @@ -37,11 +37,11 @@ func (s *Server) postRegister(c *gin.Context) { cfg := s.GetConfig() c.JSON(http.StatusOK, config.RegistrationResponse{ - Success: true, - Services: &cfg.Services, - //Roles: &cfg.Roles, - //Providers: &cfg.Providers, - //Workflows: &cfg.Workflows, + Success: true, + Services: &cfg.Services, + Roles: &cfg.Roles, + Providers: &cfg.Providers, + Workflows: &cfg.Workflows, }) } diff --git a/internal/daemon/register_test.go b/internal/daemon/register_test.go new file mode 100644 index 00000000..5aec3a4a --- /dev/null +++ b/internal/daemon/register_test.go @@ -0,0 +1,149 @@ +package daemon + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/google/uuid" + "github.com/serverlessworkflow/sdk-go/v3/model" + "github.com/thand-io/agent/internal/config" + "github.com/thand-io/agent/internal/models" +) + +func TestPostRegisterReturnsConfigurationDefinitions(t *testing.T) { + t.Parallel() + + gin.SetMode(gin.TestMode) + + cfg := &config.Config{ + Roles: config.RoleConfig{ + Definitions: map[string]models.Role{ + "viewer": { + Name: "Viewer", + Description: "Read-only access", + Enabled: true, + }, + }, + }, + Workflows: config.WorkflowConfig{ + Definitions: map[string]models.Workflow{ + "approval": { + Name: "Approval", + Description: "Approval workflow", + Enabled: true, + Workflow: &model.Workflow{ + Do: &model.TaskList{}, + }, + }, + }, + }, + Providers: config.ProviderDefinitionsConfig{ + Definitions: map[string]models.ProviderConfig{ + "oauth2-directory": { + Name: "Directory Login", + Description: "Remote OAuth2 provider", + Provider: "oauth2", + Enabled: true, + }, + }, + }, + } + + server := NewServer(cfg) + router := gin.New() + router.POST("/register", server.postRegister) + + body, err := json.Marshal(config.RegistrationRequest{ + Identifier: uuid.New(), + Environment: 
&models.EnvironmentConfig{ + Name: "device-alpha", + Hostname: "device-alpha.example.test", + Platform: models.Local, + }, + }) + if err != nil { + t.Fatalf("Marshal registration request: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, "/register", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp config.RegistrationResponse + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("Unmarshal response: %v", err) + } + + if resp.Providers == nil || resp.Providers.Definitions["oauth2-directory"].Provider != "oauth2" { + t.Fatalf("response providers missing expected definition: %#v", resp.Providers) + } + if resp.Roles == nil || resp.Roles.Definitions["viewer"].Name != "Viewer" { + t.Fatalf("response roles missing expected definition: %#v", resp.Roles) + } + if resp.Workflows == nil || resp.Workflows.Definitions["approval"].Name != "Approval" { + t.Fatalf("response workflows missing expected definition: %#v", resp.Workflows) + } +} + +func TestPostRegisterOmitsDeviceData(t *testing.T) { + t.Parallel() + + gin.SetMode(gin.TestMode) + + deviceID := uuid.NewString() + cfg := &config.Config{ + Devices: config.DeviceDefinitionsConfig{ + Definitions: map[string]models.Device{ + "workstation-alpha": { + ID: deviceID, + Name: "Workstation Alpha", + Enabled: true, + }, + }, + }, + } + + server := NewServer(cfg) + router := gin.New() + router.POST("/register", server.postRegister) + + body, err := json.Marshal(config.RegistrationRequest{ + Mode: config.ModeAgent, + Identifier: uuid.MustParse(deviceID), + Environment: &models.EnvironmentConfig{ + Name: "alpha", + Hostname: "alpha.example.test", + Platform: models.Local, + }, + }) + if err != nil { + t.Fatalf("Marshal registration request: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, 
"/register", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, want %d", rec.Code, http.StatusOK) + } + + var resp map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("Unmarshal response: %v", err) + } + + if _, found := resp["device"]; found { + t.Fatalf("expected registration response to omit device data, got %#v", resp["device"]) + } +} diff --git a/internal/daemon/server.go b/internal/daemon/server.go index d15bb220..a9239bb8 100644 --- a/internal/daemon/server.go +++ b/internal/daemon/server.go @@ -620,7 +620,7 @@ func (s *Server) apiConfigurationHandler(c *gin.Context) { workflows := []string{} // TODO: populate workflows list activities := []string{} // TODO: populate activities list - // For agent / client we show the local server for discvoery. + // For agent / client we show the local server for discovery. baseUrl := s.Config.GetLocalServerUrl() if s.Config.IsServer() { @@ -633,9 +633,14 @@ func (s *Server) apiConfigurationHandler(c *gin.Context) { capabilities["llm"] = services.HasLargeLanguageModel() capabilities["storage"] = services.HasStorage() - // However, for server we show the login server as the main - // entry point for clients to connect to - baseUrl = s.Config.GetLoginServerUrl() + // Prefer the origin the caller actually used so discovery stays + // reachable across local test hostnames and reverse proxies. + if requestBaseURL := getRequestBaseURL(c.Request); len(requestBaseURL) > 0 { + baseUrl = requestBaseURL + } else { + // Fall back to the configured login server entry point. 
+ baseUrl = s.Config.GetLoginServerUrl() + } } response := gin.H{ @@ -674,6 +679,30 @@ func (s *Server) apiConfigurationHandler(c *gin.Context) { c.JSON(http.StatusOK, response) } +func getRequestBaseURL(req *http.Request) string { + if req == nil { + return "" + } + + host := strings.TrimSpace(req.Host) + if len(host) == 0 && req.URL != nil { + host = strings.TrimSpace(req.URL.Host) + } + if len(host) == 0 { + return "" + } + + scheme := "http" + if req.TLS != nil { + scheme = "https" + } + if forwardedProto := strings.TrimSpace(req.Header.Get("X-Forwarded-Proto")); len(forwardedProto) > 0 { + scheme = strings.TrimSpace(strings.Split(forwardedProto, ",")[0]) + } + + return fmt.Sprintf("%s://%s", scheme, host) +} + // readyHandler handles the readiness check endpoint // // @Summary Readiness check diff --git a/internal/daemon/server_test.go b/internal/daemon/server_test.go new file mode 100644 index 00000000..c2db3094 --- /dev/null +++ b/internal/daemon/server_test.go @@ -0,0 +1,39 @@ +package daemon + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/require" + "github.com/thand-io/agent/internal/config" +) + +func TestAPIConfigurationHandlerPrefersRequestOriginInServerMode(t *testing.T) { + t.Parallel() + + gin.SetMode(gin.TestMode) + + cfg := config.DefaultConfig() + cfg.SetMode(config.ModeServer) + require.NoError(t, cfg.SetLoginServer("http://localhost:5225")) + + server := NewServer(cfg) + router := gin.New() + router.GET("/.well-known/api-configuration", server.apiConfigurationHandler) + + req := httptest.NewRequest(http.MethodGet, "/.well-known/api-configuration", nil) + req.Host = "thand.test:5225" + rec := httptest.NewRecorder() + router.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code) + + var response struct { + BaseURL string `json:"baseUrl"` + } + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &response)) + require.Equal(t, 
"http://thand.test:5225", response.BaseURL) +} diff --git a/internal/daemon/static/elevate_static.html b/internal/daemon/static/elevate_static.html index 956a58fa..0631bdf7 100644 --- a/internal/daemon/static/elevate_static.html +++ b/internal/daemon/static/elevate_static.html @@ -144,6 +144,12 @@

Select Identities to Assign Role

+ +