From b7637bc70a4c1792d0b41c07aea1ae47b2d89678 Mon Sep 17 00:00:00 2001 From: motatoes Date: Tue, 19 May 2026 15:03:44 -0700 Subject: [PATCH 01/32] worker: multi-cloud secrets + spot preemption monitor + AWS Packer AMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the worker-side changes needed to run an opencomputer cell on AWS. Companion infra change is in opencomputer-infra@feat/aws-us-east-2-poc. Secrets provider abstraction: - internal/config/secrets.go (new): SecretsProvider interface, LoadSecrets() factory, cloud-agnostic secretMapping (was in keyvault.go). - internal/config/secretsmanager.go (reworked): awsSecretsManagerProvider — lists secrets under OPENSANDBOX_AWS_SECRETS_PREFIX and dereferences each via secretMapping. Uses already-vendored aws-sdk-go-v2/service/secretsmanager. - internal/config/keyvault.go: refactored as azureKeyVaultProvider implementing the interface. Same mode-prefix + env-precedence semantics preserved. - internal/config/config.go: add AWSSecretsPrefix, Cloud, CPUOvercommitRatio fields. CPUOvercommitRatio multiplies MaxCapacity at Load time so the heartbeat advertises inflated capacity. - 3 LoadSecretsFromKeyVault() call sites updated to LoadSecrets(). Spot preemption monitor: - internal/preemption/ (new): Monitor interface, AWS IMDSv2 spot-action poller, Azure scheduled-events stub, no-op fallback. Watch(ctx) returns a Notice channel; cmd/worker/main.go reacts by stopping the heartbeat (CP detects worker gone within heartbeat-stale timeout). - internal/worker/redis_heartbeat.go: Stop() made idempotent via sync.Once so both the preemption goroutine and `defer hb.Stop()` are safe. Cloud-aware vector populator: - deploy/vector/populate-vector-env.sh: detect_cloud() probes IMDS, case-branches secret fetch (Azure REST vs aws secretsmanager get-secret-value). Backwards-compatible — existing Azure deployments are unchanged. 
AWS Packer template: - deploy/packer/worker-ami-aws.pkr.hcl (rewritten): amazon-ebs source, Ubuntu 24.04 LTS, c5.4xlarge builder (non-metal — bake doesn't run guest VMs), reuses the cloud-agnostic setup-azure-host.sh, installs awscli v2, builds golden rootfs, tags AMI for the aws_ami data-source lookup in the infra leaf. Docs: - deploy/worker.env.example: document new env vars (OPENSANDBOX_AWS_SECRETS_PREFIX, OPENSANDBOX_CLOUD, OPENSANDBOX_CPU_OVERCOMMIT_RATIO) and the asymmetry between safe-to-overcommit CPU vs always-1:1 memory. --- cmd/server/main.go | 235 ++----------------- cmd/worker/golden_upload.go | 15 +- cmd/worker/main.go | 269 +++++----------------- deploy/packer/worker-ami-aws.pkr.hcl | 324 ++++++++++++++++----------- deploy/vector/populate-vector-env.sh | 112 ++++++--- deploy/worker.env.example | 28 ++- internal/config/config.go | 36 +++ internal/config/keyvault.go | 228 +++++-------------- internal/config/secrets.go | 213 ++++++++++++++++++ internal/config/secretsmanager.go | 142 ++++++++---- internal/preemption/aws.go | 172 ++++++++++++++ internal/preemption/azure.go | 47 ++++ internal/preemption/monitor.go | 96 ++++++++ internal/worker/redis_heartbeat.go | 20 +- 14 files changed, 1119 insertions(+), 818 deletions(-) create mode 100644 internal/config/secrets.go create mode 100644 internal/preemption/aws.go create mode 100644 internal/preemption/azure.go create mode 100644 internal/preemption/monitor.go diff --git a/cmd/server/main.go b/cmd/server/main.go index efc2d557..b1546cc2 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -7,13 +7,12 @@ import ( "log" "log/slog" "os" - "os/signal" "strconv" "strings" + "os/signal" "syscall" - "time" - "github.com/redis/go-redis/v9" + "time" "github.com/opensandbox/opensandbox/internal/api" "github.com/opensandbox/opensandbox/internal/auth" @@ -24,7 +23,6 @@ import ( "github.com/opensandbox/opensandbox/internal/controlplane" "github.com/opensandbox/opensandbox/internal/crypto" 
"github.com/opensandbox/opensandbox/internal/db" - "github.com/opensandbox/opensandbox/internal/edgeclient" "github.com/opensandbox/opensandbox/internal/metrics" "github.com/opensandbox/opensandbox/internal/obslog" "github.com/opensandbox/opensandbox/internal/observability" @@ -37,16 +35,10 @@ import ( var ServerVersion = "dev" func main() { - // Load secrets from the appropriate cloud vault (before config.Load reads - // env vars). Each loader is a no-op if its env trigger is unset: - // SECRETS_VAULT_NAME → Azure Key Vault (Azure cells) - // OPENSANDBOX_SECRETS_ARN → AWS Secrets Manager (AWS cells) - // One or the other should be set at a time; neither = env file authoritative. - if err := config.LoadSecretsFromKeyVault(); err != nil { - log.Fatalf("failed to load secrets from Key Vault: %v", err) - } - if err := config.LoadSecretsFromSecretsManager(); err != nil { - log.Fatalf("failed to load secrets from Secrets Manager: %v", err) + // Load secrets from the configured cloud secret store (Azure KV or AWS SM) + // before config.Load reads env vars. No-op if neither is configured. + if err := config.LoadSecrets(); err != nil { + log.Fatalf("failed to load secrets: %v", err) } cfg, err := config.Load() @@ -84,14 +76,10 @@ func main() { // Build server options opts := &api.ServerOpts{ - Mode: cfg.Mode, - WorkerID: cfg.WorkerID, - Region: cfg.Region, - HTTPAddr: cfg.HTTPAddr, - CellID: cfg.CellID, - SessionJWTSecret: cfg.SessionJWTSecret, - CFAdminSecret: cfg.CFAdminSecret, - CFEventSecret: cfg.CFEventSecret, + Mode: cfg.Mode, + WorkerID: cfg.WorkerID, + Region: cfg.Region, + HTTPAddr: cfg.HTTPAddr, } // Initialize PostgreSQL if configured @@ -184,29 +172,6 @@ func main() { if err != nil { log.Fatalf("failed to connect to Redis: %v", err) } - // Reconcile-on-reconnect: when a worker rejoins after being pruned - // for missed heartbeats, run both directions of the cell-vs-worker - // state reconcile. 
See internal/controlplane/reconcile.go for the - // full rationale on each. Captured opts.Store + cfg.CellID directly; - // the closure reads them at call time. - // - // reverse first: cell-running but worker-doesn't-have-it → - // close on cell side (UpdateSessionStatus + EndScaleEvent + - // publish stopped event). Stops the billing leak immediately. - // - // forward second: cell-stopped but worker-still-hosting → - // re-issue Destroy via RPC. Cleans the worker side. - // - // Reverse runs first because the more urgent dollars-on-fire case is - // the still-open scale event accruing minute-by-minute, not a stray - // qemu the worker still has. - redisRegistry.OnWorkerRejoined(func(workerID string) { - ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) - defer cancel() - controlplane.ReconcileRunningOnWorker(ctx, redisRegistry, opts.Store, cfg.CellID, workerID) - controlplane.ReconcileStoppedOnWorker(ctx, redisRegistry, opts.Store, workerID) - }) - redisRegistry.Start() defer redisRegistry.Stop() opts.WorkerRegistry = redisRegistry @@ -218,48 +183,6 @@ func main() { opts.SandboxAPIProxy = proxy.NewSandboxAPIProxy(opts.Store, redisRegistry, opts.JWTIssuer) log.Println("opensandbox: sandbox API proxy enabled (data-plane requests proxied to workers)") } - - // CF-parallel event forwarder. Drains events:{cell_id} from Redis and - // POSTs HMAC-signed batches to the events-ingest Worker. Inert when - // CFEventEndpoint is empty — old NATS path keeps running independently. 
- if cfg.CFEventEndpoint != "" && cfg.CFEventSecret != "" && cfg.CellID != "" { - cfClient := controlplane.NewCFEventClient(cfg.CFEventEndpoint, cfg.CFEventSecret, cfg.CellID) - fwd, err := controlplane.NewEventForwarder(controlplane.EventForwarderConfig{ - Redis: redisRegistry.RedisClient(), - CellID: cfg.CellID, - Client: cfClient, - }) - if err != nil { - log.Fatalf("event_forwarder: %v", err) - } - if err := fwd.Start(context.Background()); err != nil { - log.Fatalf("event_forwarder start: %v", err) - } - defer func() { - stopCtx, stopCancel := context.WithTimeout(context.Background(), 10*time.Second) - defer stopCancel() - _ = fwd.Stop(stopCtx) - }() - log.Printf("opensandbox: CF event forwarder started (endpoint=%s cell=%s)", cfg.CFEventEndpoint, cfg.CellID) - } else if cfg.Mode == "server" { - log.Printf("opensandbox: CF event forwarder NOT started (CFEventEndpoint/Secret/CellID unset)") - } - - // Capacity reporter — periodically pushes a cell_capacity event onto the - // same events:{cell_id} stream the forwarder drains. Feeds the edge's - // pickCell() cascade via D1. Inert when CellID is empty. - if cfg.CellID != "" { - cr, err := controlplane.NewCapacityReporter(controlplane.CapacityReporterConfig{ - Redis: redisRegistry.RedisClient(), - Registry: redisRegistry, - CellID: cfg.CellID, - }) - if err != nil { - log.Fatalf("capacity_reporter: %v", err) - } - cr.Start(context.Background()) - defer cr.Stop() - } } // Hoisted at function scope so the per-sandbox autoscaler (created @@ -271,63 +194,10 @@ func main() { // Initialize compute pool + autoscaler (server mode) if cfg.Mode == "server" && redisRegistry != nil { - // Build the WorkerSpec: cloud-neutral config that the CP supplies to - // whichever pool is selected. The pool combines this with cloud-specific - // cloud-init to launch new workers. - // - // Workers need to reach Postgres/Redis on the CP's private IP, - // not localhost. Replace localhost with the CP's IP if known. 
- cpIP := os.Getenv("OPENSANDBOX_CONTROLPLANE_IP") - workerDBURL := cfg.DatabaseURL - workerRedisURL := cfg.RedisURL - if cpIP != "" { - workerDBURL = strings.ReplaceAll(workerDBURL, "localhost", cpIP) - workerDBURL = strings.ReplaceAll(workerDBURL, "127.0.0.1", cpIP) - workerRedisURL = strings.ReplaceAll(workerRedisURL, "localhost", cpIP) - workerRedisURL = strings.ReplaceAll(workerRedisURL, "127.0.0.1", cpIP) - } - spec := compute.WorkerSpec{ - CellID: cfg.CellID, - Region: cfg.Region, - DatabaseURL: workerDBURL, - RedisURL: workerRedisURL, - JWTSecret: cfg.JWTSecret, - SessionJWTSecret: cfg.SessionJWTSecret, - CFEventEndpoint: cfg.CFEventEndpoint, - CFEventSecret: cfg.CFEventSecret, - CFAdminSecret: cfg.CFAdminSecret, - MaxCapacity: cfg.MaxCapacity, - SandboxDomain: cfg.SandboxDomain, - DefaultMemoryMB: cfg.DefaultSandboxMemoryMB, - DefaultCPUs: cfg.DefaultSandboxCPUs, - DefaultDiskMB: cfg.DefaultSandboxDiskMB, - S3Bucket: cfg.S3Bucket, - S3Region: cfg.S3Region, - S3Endpoint: cfg.S3Endpoint, - S3AccessKeyID: cfg.S3AccessKeyID, - S3SecretAccessKey: cfg.S3SecretAccessKey, - S3ForcePathStyle: cfg.S3ForcePathStyle, - SegmentWriteKey: cfg.SegmentWriteKey, - SecretsRef: cfg.SecretsARN, - } - - // Provider selection. Explicit cfg.ComputeProvider wins; otherwise we - // autodetect from existing fields for backwards compatibility. - provider := cfg.ComputeProvider - if provider == "" { - switch { - case cfg.AzureSubscriptionID != "" && (cfg.AzureImageID != "" || cfg.AzureKeyVaultName != ""): - provider = "azure" - case cfg.EC2AMI != "" || cfg.EC2SSMParameterName != "": - provider = "aws" - } - } - var pool compute.Pool var poolName string - switch provider { - case "azure": + if cfg.AzureSubscriptionID != "" && (cfg.AzureImageID != "" || cfg.AzureKeyVaultName != "") { // Build worker env template — new VMs get this via cloud-init. // GRPC_ADVERTISE, HTTP_ADDR, and WORKER_ID are patched by cloud-init // with the VM's actual private IP and hostname. 
@@ -433,7 +303,7 @@ func main() { if err != nil { log.Fatalf("opensandbox: failed to create Azure pool: %v", err) } - azPool.SetWorkerSpec(spec) + // If image not set statically but Key Vault is configured, fetch initial image if cfg.AzureImageID == "" && cfg.AzureKeyVaultName != "" { imgID, version, kvErr := azPool.RefreshAMI(context.Background()) if kvErr != nil { @@ -443,8 +313,8 @@ func main() { } pool = azPool poolName = fmt.Sprintf("Azure (size=%s, image=%s, keyvault=%s)", cfg.AzureVMSize, cfg.AzureImageID, cfg.AzureKeyVaultName) - - case "aws": + } else if cfg.EC2AMI != "" || cfg.EC2SSMParameterName != "" { + // AWS EC2 compute pool (AMI from config or dynamically from SSM) ec2Pool, err := compute.NewEC2Pool(compute.EC2PoolConfig{ Region: cfg.S3Region, AccessKeyID: cfg.S3AccessKeyID, @@ -461,7 +331,7 @@ func main() { if err != nil { log.Fatalf("opensandbox: failed to create EC2 pool: %v", err) } - ec2Pool.SetWorkerSpec(spec) + // If AMI not set statically but SSM is configured, fetch initial AMI from SSM if cfg.EC2AMI == "" && cfg.EC2SSMParameterName != "" { amiID, version, ssmErr := ec2Pool.RefreshAMI(context.Background()) if ssmErr != nil { @@ -471,11 +341,6 @@ func main() { } pool = ec2Pool poolName = fmt.Sprintf("EC2 (ami=%s, type=%s, ssm=%s)", cfg.EC2AMI, cfg.EC2InstanceType, cfg.EC2SSMParameterName) - - case "": - log.Println("opensandbox: no compute provider configured (combined mode, no autoscaling)") - default: - log.Fatalf("opensandbox: unknown compute provider %q (expected azure|aws)", provider) } if pool != nil { @@ -504,13 +369,6 @@ func main() { MaxWorkers: cfg.MaxWorkersPerRegion, IdleReserve: cfg.IdleReserveWorkers, MachineSizes: machineSizes, - // For "migrated" event emit after scaler-driven migrations - // (rolling replace, evacuation) — keeps D1 sandboxes_index - // worker_id in sync with cell-PG truth. Without this, the - // dashboard's "which worker is my sandbox on" view goes stale - // every time the autoscaler shuffles things around. 
- RedisClient: redisRegistry.RedisClient(), - CellID: cfg.CellID, }) defer scaler.Stop() @@ -555,21 +413,12 @@ func main() { for _, w := range redisRegistry.GetAllWorkers() { liveWorkers[w.ID] = true } - orphans, err := opts.Store.MarkOrphanedSandboxes(ctx, liveWorkers) + orphaned, err := opts.Store.MarkOrphanedSandboxes(ctx, liveWorkers) if err != nil { log.Printf("maintenance: orphan reconciliation error: %v", err) observability.CaptureError(err, "area", "maintenance", "op", "mark_orphaned_sandboxes") - } else if len(orphans) > 0 { - log.Printf("maintenance: marked %d sandboxes as error (worker lost)", len(orphans)) - // Mirror to D1 via the events stream. Without these XADDs, - // sandboxes_index keeps showing the rows as running on the - // dead worker indefinitely — the post-cutover ghost-row - // bug. Best-effort; the next tick will re-emit for any - // row still marked `error` on a dead worker if Redis - // rejected the first attempt. - if redisRegistry.RedisClient() != nil && cfg.CellID != "" { - publishStoppedFromMaintenance(ctx, redisRegistry.RedisClient(), cfg.CellID, orphans) - } + } else if orphaned > 0 { + log.Printf("maintenance: marked %d sandboxes as error (worker lost)", orphaned) } } }) @@ -606,15 +455,6 @@ func main() { // Create API server server := api.NewServer(mgr, ptyMgr, cfg.APIKey, opts) - // Wire the CF api-edge HTTP client. Used by resolveSecretStoreInto + - // resolveTemplate to read from D1 over HMAC instead of local PG once - // migration 041 strips the global tables. Falls back to s.store if - // either CFEdgeBaseURL or CFEventSecret is unset (combined dev mode). - if cfg.CFEdgeBaseURL != "" && cfg.CFEventSecret != "" { - server.SetEdgeClient(edgeclient.New(cfg.CFEdgeBaseURL, cfg.CFEventSecret)) - log.Printf("opensandbox: edge client wired (base=%s)", cfg.CFEdgeBaseURL) - } - // Wire Axiom read-only token for the sandbox session logs API. 
// Token never leaves this process; the UI proxies its queries through // /api/sandboxes/:id/logs. Empty token disables the endpoint (503). @@ -669,40 +509,11 @@ func main() { workers = redisRegistry } reporter := billing.NewUsageReporter(opts.Store, stripeClient, workers) - // CF billing mode: when this CP is wired into the CF event pipe, the - // CreditAccount DO is authoritative on free-tier balance. Disable - // the local free-tier deduction pass so both sides don't race. - if cfg.CFEventEndpoint != "" { - reporter.SetCFBillingMode(true) - log.Println("opensandbox: usage reporter CF-billing mode ON (free-tier deduction deferred to CreditAccount DO)") - } reporter.Start() defer reporter.Stop() log.Println("opensandbox: usage reporter started (interval=5m)") } - // Halt reconciler — safety net for missed CF halt webhooks. Pulls the - // authoritative halt-list from api-edge every 60s and re-issues halts - // for anything that should be halted but isn't. Inert unless - // OPENSANDBOX_HALT_LIST_URL is set. - if cfg.HaltListURL != "" && cfg.CFEventSecret != "" && server != nil { - reconciler := controlplane.NewHaltReconciler(controlplane.HaltReconcilerConfig{ - CellID: cfg.CellID, - ListURL: cfg.HaltListURL, - Secret: cfg.CFEventSecret, - Halter: server, - }) - if reconciler != nil { - reconciler.Start(ctx) - defer func() { - stopCtx, stopCancel := context.WithTimeout(context.Background(), 5*time.Second) - defer stopCancel() - _ = reconciler.Stop(stopCtx) - }() - log.Printf("opensandbox: halt reconciler started (list_url=%s, period=60s)", cfg.HaltListURL) - } - } - // Phase-2 capacity allocator. Writes outbox rows for unified-mode // pro orgs after each settled bucket. 
Allocator skips legacy and // free orgs (see ListAllocatorCandidates); rollback is by @@ -818,13 +629,3 @@ func getIntEnv(key string, def int) int { } return def } - -// publishStoppedFromMaintenance emits a `stopped` lifecycle event per -// orphaned sandbox so D1 sandboxes_index mirrors the PG sweep done by -// MarkOrphanedSandboxes. Without these XADDs, the maintenance loop's -// dead-worker cleanup is invisible to the dashboard. -func publishStoppedFromMaintenance(ctx context.Context, rdb *redis.Client, cellID string, orphans []db.OrphanedSandbox) { - for _, o := range orphans { - controlplane.PublishLifecycle(ctx, rdb, cellID, "stopped", o.SandboxID, o.WorkerID, o.OrgID, "worker_lost") - } -} diff --git a/cmd/worker/golden_upload.go b/cmd/worker/golden_upload.go index 636a2179..c0fc04c8 100644 --- a/cmd/worker/golden_upload.go +++ b/cmd/worker/golden_upload.go @@ -30,14 +30,13 @@ import ( // Reads OPENSANDBOX_GLOBAL_BLOB_* env vars for endpoint + creds. Fails loud // if blobstore isn't configured. func uploadGolden(path string) error { - // Bootstrap from the appropriate cloud vault — same pattern as main(). One - // of these picks up worker-global-blob-* secrets so the upload knows where - // to push; the other is a no-op when its trigger env var isn't set. - if err := config.LoadSecretsFromKeyVault(); err != nil { - return fmt.Errorf("keyvault bootstrap: %w", err) - } - if err := config.LoadSecretsFromSecretsManager(); err != nil { - return fmt.Errorf("secretsmanager bootstrap: %w", err) + // Bootstrap from the configured secret store the same way the main + // worker does — so this subcommand picks up worker-global-blob-* secrets + // when run on a host whose worker.env points at a KV / SM prefix. + // Without this, the user would have to manually export every Tigris env + // var inline. 
+ if err := config.LoadSecrets(); err != nil { + return fmt.Errorf("secrets bootstrap: %w", err) } cfg, err := config.Load() if err != nil { diff --git a/cmd/worker/main.go b/cmd/worker/main.go index aed3e79b..16e28879 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -14,18 +14,15 @@ import ( "syscall" "time" - "github.com/google/uuid" - "github.com/redis/go-redis/v9" - "github.com/opensandbox/opensandbox/internal/analytics" "github.com/opensandbox/opensandbox/internal/auth" "github.com/opensandbox/opensandbox/internal/blobstore" - "github.com/opensandbox/opensandbox/internal/cellevents" "github.com/opensandbox/opensandbox/internal/config" "github.com/opensandbox/opensandbox/internal/db" "github.com/opensandbox/opensandbox/internal/metrics" "github.com/opensandbox/opensandbox/internal/observability" "github.com/opensandbox/opensandbox/internal/obslog" + "github.com/opensandbox/opensandbox/internal/preemption" "github.com/opensandbox/opensandbox/internal/proxy" qm "github.com/opensandbox/opensandbox/internal/qemu" "github.com/opensandbox/opensandbox/internal/sandbox" @@ -44,7 +41,7 @@ var WorkerVersion = "dev" func main() { // Subcommands that don't need config/secrets. Must short-circuit before - // LoadSecretsFromKeyVault, which is slow and would fail outside Azure. + // LoadSecrets, which is slow and would fail without cloud credentials. // // "golden-version " prints the full-file hash used for golden-image // archive keys. Packer invokes this so the archive key matches what @@ -82,17 +79,10 @@ func main() { return } - // Load secrets from the appropriate cloud vault (before config.Load reads - // env vars). Each loader is a no-op if its env trigger is unset: - // SECRETS_VAULT_NAME → Azure Key Vault (Azure cells) - // OPENSANDBOX_SECRETS_ARN → AWS Secrets Manager (AWS cells) - // One or the other should be set at a time; neither = env file authoritative. 
- if err := config.LoadSecretsFromKeyVault(); err != nil { + // Load secrets from the configured cloud secret store (Azure Key Vault or AWS Secrets Manager) before config.Load reads env vars. + if err := config.LoadSecrets(); err != nil { log.Fatalf("failed to load secrets from Key Vault: %v", err) } - if err := config.LoadSecretsFromSecretsManager(); err != nil { - log.Fatalf("failed to load secrets from Secrets Manager: %v", err) - } cfg, err := config.Load() if err != nil { @@ -136,10 +126,6 @@ func main() { var doGracefulShutdown func(checkpointStore *storage.CheckpointStore, store *db.Store) // Metadata server (set by QEMU backend, wired to store later) var metadataSrv *worker.MetadataServer - // Forward-declared so doGracefulShutdown's closure (built in the QEMU - // init block below) can capture it; the actual NewSandboxDBManager call - // happens after backend init. - var sandboxDBMgr *sandbox.SandboxDBManager // Initialize secrets proxy for MITM token substitution. // Runs on :3128 — VMs route HTTPS through this to keep real secrets off-VM. @@ -327,7 +313,7 @@ func main() { log.Printf("opensandbox-worker: %d VMs failed to hibernate: %v", len(failed), failed) } - processHibernateResults(results, store, checkpointStore, sandboxDBMgr, func(r interface{}) (string, string, error) { + processHibernateResults(results, store, checkpointStore, func(r interface{}) (string, string, error) { hr := r.(qm.HibernateAllResult) return hr.SandboxID, hr.HibernationKey, hr.Err }) @@ -354,9 +340,8 @@ func main() { ptyMgr := sandbox.NewAgentPTYManager(ptySessionFactory) defer ptyMgr.CloseAll() - // Initialize per-sandbox SQLite manager (forward-declared above so the - // graceful-shutdown closure can capture it). 
- sandboxDBMgr = sandbox.NewSandboxDBManager(cfg.DataDir) + // Initialize per-sandbox SQLite manager + sandboxDBMgr := sandbox.NewSandboxDBManager(cfg.DataDir) defer sandboxDBMgr.Close() // JWT issuer @@ -428,21 +413,11 @@ func main() { defer store.Close() log.Println("opensandbox-worker: PostgreSQL store connected (auto-wake enabled)") - hibernated, stopped, err := store.ReconcileWorkerSessions(ctx, cfg.WorkerID) + _, stopped, err := store.ReconcileWorkerSessions(ctx, cfg.WorkerID) if err != nil { log.Printf("opensandbox-worker: warning: session reconciliation failed: %v", err) - } else { - if len(stopped) > 0 { - log.Printf("opensandbox-worker: reconciled %d unrecoverable sessions as stopped", len(stopped)) - } - if len(hibernated) > 0 { - log.Printf("opensandbox-worker: reconciled %d sessions as hibernated", len(hibernated)) - } - // Mirror PG state changes to D1 via the cell events stream. - // Without these XADDs, the dashboard keeps the rows at the - // pre-restart state ("running" on a worker that just rebooted). - emitReconcileEvents(ctx, cfg, "hibernated", "worker_restart", hibernated) - emitReconcileEvents(ctx, cfg, "stopped", "worker_restart", stopped) + } else if stopped > 0 { + log.Printf("opensandbox-worker: reconciled %d unrecoverable sessions as stopped", stopped) } // Wire up metadata server billing callback @@ -500,20 +475,6 @@ func main() { } _ = store.UpdateSandboxSessionStatus(context.Background(), sandboxID, "hibernated", nil) } - // Emit "hibernated" lifecycle event so events-ingest mirrors the - // status flip to D1 sandboxes_index. Without this, idle-timeout - // hibernations only land in cell PG and D1 keeps showing the - // sandbox as running on a worker that no longer hosts it. 
- if sandboxDBMgr != nil { - if sdb, dbErr := sandboxDBMgr.Get(sandboxID); dbErr == nil { - _ = sdb.LogEvent("hibernated", map[string]string{ - "sandbox_id": sandboxID, - "checkpoint_key": result.HibernationKey, - "reason": "idle_timeout", - }) - } - _ = sandboxDBMgr.Remove(sandboxID) - } }, OnKill: func(sandboxID string) { log.Printf("opensandbox-worker: sandbox %s killed on timeout", sandboxID) @@ -521,17 +482,6 @@ func main() { if store != nil { _ = store.UpdateSandboxSessionStatus(context.Background(), sandboxID, "stopped", nil) } - // Same fix as OnHibernate above — D1 needs a "stopped" event so - // the dashboard doesn't keep the row at "running" forever. - if sandboxDBMgr != nil { - if sdb, dbErr := sandboxDBMgr.Get(sandboxID); dbErr == nil { - _ = sdb.LogEvent("stopped", map[string]string{ - "sandbox_id": sandboxID, - "reason": "kill_timeout", - }) - } - _ = sandboxDBMgr.Remove(sandboxID) - } }, }) defer sbRouter.Close() @@ -688,92 +638,58 @@ func main() { fixed, err := store.ReconcileWorkerReconnect(context.Background(), cfg.WorkerID, runningIDs) if err != nil { log.Printf("opensandbox-worker: reconnect reconciliation failed: %v", err) - } else if len(fixed) > 0 { - log.Printf("opensandbox-worker: reconnect reconciliation: %d sessions restored to running", len(fixed)) - // Mirror to D1: emit `running` events so the dashboard - // reflects the recovery. Without these, D1 keeps the - // `error` status from the maintenance loop's previous - // sweep — customer sees their sandbox as broken until - // the next state-changing event. - // - // Bounded context so a redis stall during shutdown can't - // block this callback forever. Budget: 10s per event - // (3s XADD × up to 3 retries) × len(fixed), capped at - // 60s overall. Reconnect storms rarely produce more - // than a handful of fixed rows. 
- emitCtx, emitCancel := context.WithTimeout(context.Background(), 60*time.Second) - emitReconcileEvents(emitCtx, cfg, "running", "worker_reconnect", fixed) - emitCancel() + } else if fixed > 0 { + log.Printf("opensandbox-worker: reconnect reconciliation: %d sessions restored to running", fixed) } }) } defer hb.Stop() log.Println("opensandbox-worker: Redis heartbeat started") - } - } - // CF-parallel: Redis Streams event publisher. Inert unless CellID is set. - // (The legacy NATS publisher used to run alongside this; it was removed - // once Redis Streams covered all event types end-to-end. NATSURL in the - // env file is ignored.) - if cfg.CellID != "" && cfg.RedisURL != "" { - // Resolver: look up sandbox → org → plan via cell-local PG. Called - // per event during flush; sandbox_sessions has an indexed lookup on - // sandbox_id and orgs is keyed by org_id, so each call is two index - // hits. usage_tick volume is sandboxes × ~30s, so cost is bounded. - // nil store (no PG) leaves the fields blank — events-ingest then - // treats them as "unknown plan" and skips DO debit, which is the - // safe fallback. - var planResolver worker.MetadataResolver - if store != nil { - st := store - planResolver = func(sandboxID string) (string, string, bool) { - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - orgIDStr, err := st.GetSandboxOrgID(ctx, sandboxID) - if err != nil || orgIDStr == "" { - return "", "", false + // Spot-preemption monitor. NewMonitor returns a no-op on + // non-cloud deployments — the goroutine still spins but + // never fires. When OPENSANDBOX_CLOUD=aws the AWS monitor + // polls IMDSv2 every 5s for /latest/meta-data/spot/instance-action. + // + // On a Notice we drain in the most minimal way possible for + // the PoC: stop the heartbeat so the CP sees us as gone and + // re-schedules our sandboxes. 
The hibernate-each-sandbox path + // is the next iteration — for now, sandboxes on a preempted + // host fail and the customer re-creates. This matches the + // "PoC accepts sandbox state loss on reclaim" risk in the plan. + preemptMon := preemption.NewMonitor() + go func() { + notices := preemptMon.Watch(ctx) + for notice := range notices { + log.Printf("opensandbox-worker: PREEMPTION notice from %s — action=%s eta=%s, draining now", + preemptMon.Name(), notice.Action, notice.ETA.Format(time.RFC3339)) + hb.Stop() + // TODO: hibernate live sandboxes via mgr to S3 within + // the ETA budget before exiting. Until then the + // kernel/systemd terminates us when the cloud reclaims. + return } - orgID, err := uuid.Parse(orgIDStr) - if err != nil { - return "", "", false - } - org, err := st.GetOrg(ctx, orgID) - if err != nil { - // Org row may not exist yet (new app2 user, lazy upsert - // hasn't run from this sandbox's create). Return the - // org_id so events-ingest can still archive; leave plan - // blank so it skips DO debit until the row exists. - return orgIDStr, "", true - } - return orgIDStr, org.Plan, true - } + }() } - redisPub, err := worker.NewRedisEventPublisher(worker.RedisEventPublisherConfig{ - RedisURL: cfg.RedisURL, - SandboxDBs: sandboxDBMgr, - CellID: cfg.CellID, - WorkerID: cfg.WorkerID, - Resolver: planResolver, - }) + } + + // NATS + if cfg.NATSURL != "" { + pub, err := worker.NewEventPublisher(cfg.NATSURL, cfg.Region, cfg.WorkerID, sandboxDBMgr) if err != nil { - log.Printf("opensandbox-worker: Redis event publisher init failed: %v (continuing)", err) + log.Printf("opensandbox-worker: NATS not available: %v (continuing without event sync)", err) } else { - // Wire the publisher as the SandboxDBManager's OnRemove hook so - // terminal events (stopped, hibernated) are synchronously flushed - // to Redis BEFORE the SQLite file is deleted. 
The destroy / - // hibernate gRPC handlers LogEvent + then call Remove; this hook - // closes the race that previously dropped those events. - sandboxDBMgr.SetOnRemove(func(sandboxID string) { - redisPub.FlushSandbox(context.Background(), sandboxID) + pub.Start() + if qemuMgr != nil { + pub.SetGoldenVersion(qemuMgr.GoldenVersion()) + } + pub.StartHeartbeat(func() (int, int, float64, float64, float64) { + count, _ := mgr.Count(context.Background()) + cpuPct, memPct, diskPct := worker.SystemStats() + return cfg.MaxCapacity, count, cpuPct, memPct, diskPct }) - redisPub.Start(context.Background()) - defer func() { - stopCtx, stopCancel := context.WithTimeout(context.Background(), 5*time.Second) - defer stopCancel() - _ = redisPub.Stop(stopCtx) - }() - log.Printf("opensandbox-worker: Redis event publisher started (stream=events:%s)", cfg.CellID) + defer pub.Stop() + log.Println("opensandbox-worker: NATS event publisher started") } } @@ -781,25 +697,6 @@ func main() { autosaver := worker.NewWorkspaceAutosaver(mgr, autosaverSyncer, 5*time.Minute) autosaver.Start() - // Usage ticker — drives the free-tier billing loop. Emits a usage_tick - // per running sandbox every 20s; events-ingest fans out to per-org - // CreditAccount DOs, which debit balance + dispatch halt when it - // hits zero. Without this the free-tier balance never decrements. - // Inert unless CellID is set (combined-mode dev without Redis stream - // would write events to /dev/null since there's no consumer). - if cfg.CellID != "" && mgr != nil { - usageTicker := worker.NewUsageTicker(mgr, sandboxDBMgr, 20*time.Second, 10) - if usageTicker != nil { - usageTicker.Start(context.Background()) - defer func() { - stopCtx, stopCancel := context.WithTimeout(context.Background(), 3*time.Second) - defer stopCancel() - _ = usageTicker.Stop(stopCtx) - }() - log.Println("opensandbox-worker: usage ticker started (interval=20s, 10¢/tick)") - } - } - // Segment analytics — ships per-org GB-seconds memory usage. 
nil if SEGMENT_WRITE_KEY unset. segmentClient := analytics.New(cfg.SegmentWriteKey) if segmentClient != nil { @@ -825,25 +722,9 @@ func main() { OnHibernateIdle: func(sandboxIDs []string) { for _, id := range sandboxIDs { if checkpointStore != nil { - result, err := mgr.Hibernate(context.Background(), id, checkpointStore) + _, err := mgr.Hibernate(context.Background(), id, checkpointStore) if err != nil { log.Printf("pressure-hibernate %s: %v", id, err) - continue - } - // Mirror the per-sandbox SQLite event the gRPC Hibernate - // handler writes — without this, auto-hibernate writes - // the SUCCEED to local PG but never publishes "hibernated" - // to events-ingest, so D1 sandboxes_index drifts to - // "running" while the cell PG says "hibernated". - if sandboxDBMgr != nil { - if sdb, dbErr := sandboxDBMgr.Get(id); dbErr == nil { - _ = sdb.LogEvent("hibernated", map[string]string{ - "sandbox_id": id, - "checkpoint_key": result.HibernationKey, - "reason": "pressure_auto", - }) - } - _ = sandboxDBMgr.Remove(id) } } } @@ -1126,15 +1007,7 @@ func deleteOldHibernation(store *storage.CheckpointStore, key string) { } // processHibernateResults handles results from HibernateAll for both backends. -// -// In addition to updating cell-local PG, we LogEvent("hibernated") into the -// per-sandbox SQLite then call sandboxDBs.Remove — the Remove hook flushes -// any unsynced events (including this one) to Redis Streams synchronously. -// events-ingest then mirrors the state to D1, keeping the dashboard list in -// sync. Without this, the bulk-shutdown path silently skipped the lifecycle -// event the gRPC HibernateSandbox handler emits per call, and D1 stayed -// "running" until something else nudged it. 
-func processHibernateResults(results interface{}, store *db.Store, checkpointStore *storage.CheckpointStore, sandboxDBs *sandbox.SandboxDBManager, extract func(interface{}) (string, string, error)) { +func processHibernateResults(results interface{}, store *db.Store, checkpointStore *storage.CheckpointStore, extract func(interface{}) (string, string, error)) { switch rs := results.(type) { case []qm.HibernateAllResult: for _, r := range rs { @@ -1144,12 +1017,6 @@ func processHibernateResults(results interface{}, store *db.Store, checkpointSto errMsg := "hibernate failed on shutdown: " + r.Err.Error() _ = store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "stopped", &errMsg) } - if sandboxDBs != nil { - if sdb, err := sandboxDBs.Get(r.SandboxID); err == nil { - _ = sdb.LogEvent("stopped", map[string]string{"reason": "hibernate failed on shutdown"}) - } - _ = sandboxDBs.Remove(r.SandboxID) - } continue } log.Printf("opensandbox-worker: hibernated %s (key=%s)", r.SandboxID, r.HibernationKey) @@ -1162,40 +1029,10 @@ func processHibernateResults(results interface{}, store *db.Store, checkpointSto _ = store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "hibernated", nil) } } - if sandboxDBs != nil { - if sdb, err := sandboxDBs.Get(r.SandboxID); err == nil { - _ = sdb.LogEvent("hibernated", map[string]string{"key": r.HibernationKey, "reason": "graceful_shutdown"}) - } - _ = sandboxDBs.Remove(r.SandboxID) - } } } } -// emitReconcileEvents XADDs a `cellevents.PublishLifecycle` per orphan to the -// cell's events stream. Used by the worker-startup and worker-reconnect -// reconcilers — both run rarely (boot, network blip) and need a redis client -// just for the emit. Keeping the construction here means one URL parse + one -// pool init + one Close per call site, instead of inlining the same dance -// twice in main(). 
ctx must be bounded by the caller; the function does not -// add its own timeout (cellevents.PublishLifecycle has a 3s XADD timeout per -// attempt with up to 3 retries, so worst case ~10s per event). -func emitReconcileEvents(ctx context.Context, cfg *config.Config, eventType, reason string, orphans []db.OrphanedSandbox) { - if len(orphans) == 0 || cfg.RedisURL == "" || cfg.CellID == "" { - return - } - opts, err := redis.ParseURL(cfg.RedisURL) - if err != nil { - log.Printf("opensandbox-worker: reconcile emit (%s): redis URL parse failed: %v — events skipped", eventType, err) - return - } - rdb := redis.NewClient(opts) - defer rdb.Close() - for _, o := range orphans { - cellevents.PublishLifecycle(ctx, rdb, cfg.CellID, eventType, o.SandboxID, o.WorkerID, o.OrgID, reason) - } -} - // recoverLocalQEMU handles local disk recovery for QEMU backend. func recoverLocalQEMU(ctx context.Context, qmMgr *qm.Manager, store *db.Store, cfg *config.Config) { recoveries := qmMgr.RecoverLocalSandboxes() diff --git a/deploy/packer/worker-ami-aws.pkr.hcl b/deploy/packer/worker-ami-aws.pkr.hcl index 765ece05..f00d9fef 100644 --- a/deploy/packer/worker-ami-aws.pkr.hcl +++ b/deploy/packer/worker-ami-aws.pkr.hcl @@ -1,17 +1,36 @@ -# worker-ami-aws.pkr.hcl — Packer template for AWS EC2 worker AMIs. +# worker-ami-aws.pkr.hcl — Build an immutable AMI for OpenSandbox workers (QEMU backend) on AWS. # -# Counterpart to worker-ami.pkr.hcl (which builds Azure managed images + -# publishes them to a Compute Gallery). This builds an AMI in EC2 and -# optionally publishes the AMI ID to SSM Parameter Store so EC2Pool's -# RefreshAMI loop picks it up automatically. +# Mirrors deploy/packer/worker-ami.pkr.hcl (Azure variant) but targets the +# amazon-ebs builder. The setup script (`deploy/azure/setup-azure-host.sh`) +# is cloud-agnostic in practice — it installs QEMU + kernel modules + systemd +# units + Vector and never talks to Azure-specific APIs. We reuse it as-is. 
# -# Build: packer build -var "aws_region=us-east-1" deploy/packer/worker-ami-aws.pkr.hcl +# Differences from the Azure file: +# - amazon-ebs source on Ubuntu 24.04 LTS x86_64 instead of azure-arm. +# - No rootfs blob caching (the Azure variant's elaborate Azure-blob cache +# dance was the only Azure-API touch; for the PoC we just rebuild the +# rootfs each time, ~10min extra per bake — acceptable for low rebuild +# frequency). +# - Installs awscli (needed by deploy/vector/populate-vector-env.sh AWS path +# and by the worker user-data shared-disk attach). +# - Tags the AMI for the terraform `aws_ami` data source lookup +# (opensandbox-role=worker, opensandbox-cloud=aws). # -# Inputs (most overridable via -var or env vars): -# - aws_region AWS region to build in -# - source_ami_filter Filter for the base Ubuntu AMI (default: 24.04 LTS) -# - instance_type Builder instance type (small is fine) -# - ssm_param_name If set, AMI ID is written here on success +# Usage: +# # 1. Build binaries for linux/amd64: +# CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.WorkerVersion=$(git rev-parse --short HEAD)" \ +# -o bin/opensandbox-worker ./cmd/worker/ +# CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/osb-agent ./cmd/agent/ +# +# # 2. Build the rootfs context tarball: +# tar czf /tmp/packer-rootfs-ctx.tar.gz deploy/firecracker/rootfs/ deploy/ec2/build-rootfs-docker.sh scripts/claude-agent-wrapper/ +# +# # 3. Run packer: +# packer init deploy/packer/worker-ami-aws.pkr.hcl +# packer build -var "worker_version=$(git rev-parse --short HEAD)" deploy/packer/worker-ami-aws.pkr.hcl +# +# # 4. The data source in opencomputer-infra/terraform/aws/us-east-2-poc/ami.tf +# # picks up the new AMI on the next `tofu apply`. 
packer { required_plugins { @@ -22,116 +41,117 @@ packer { } } -variable "aws_region" { - type = string - default = "us-east-1" -} - -variable "instance_type" { - type = string - default = "t3.medium" -} - -variable "ami_name_prefix" { - type = string - default = "opensandbox-worker" -} +# --------------------------------------------------------------------- +# Variables +# --------------------------------------------------------------------- variable "worker_version" { type = string - description = "Worker version tag (e.g. git short SHA). Used in AMI name + tags." + description = "Worker version (git SHA). Baked into AMI name and tags." } -variable "ssm_param_name" { - type = string - description = "If set, the built AMI ID is written to this SSM parameter (e.g. /opensandbox/dev/worker-ami-id)." - default = "" +variable "agent_version" { + type = string + default = "" } -variable "guest_image_version" { - type = string - description = "Hash of the canonical guest rootfs (default.ext4). Pulled from R2/S3 during AMI build." - default = "" +variable "region" { + type = string + default = "us-east-2" } -variable "guest_image_url" { +variable "instance_type" { type = string - description = "Canonical URL for the guest rootfs blob (e.g. https://r2.opensandbox.dev/golden-store/bases/{hash}/default.ext4)." - default = "" + default = "c5.4xlarge" + description = "Builder instance type. Needs enough memory for Docker rootfs build (~8GB) but doesn't need to run guest VMs, so non-metal is fine and saves ~10× vs c5.metal." } variable "worker_binary" { - type = string - default = "bin/opensandbox-worker" - description = "Path to the cross-compiled worker binary (linux/amd64)." + type = string + default = "bin/opensandbox-worker" } variable "agent_binary" { - type = string - default = "bin/osb-agent" - description = "Path to the cross-compiled agent binary (linux/amd64)." 
+ type = string + default = "bin/osb-agent" } -variable "kernel_path" { +variable "rootfs_context" { type = string - default = "deploy/firecracker/vmlinux" - description = "Path to the Linux kernel image baked into the AMI." + default = "/tmp/packer-rootfs-ctx.tar.gz" + description = "Pre-built tarball of rootfs + agent wrapper sources." } -# Latest Ubuntu 24.04 LTS amd64 from Canonical's account. -data "amazon-ami" "ubuntu" { - region = var.aws_region - most_recent = true - owners = ["099720109477"] - filters = { - name = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*" - root-device-type = "ebs" - virtualization-type = "hvm" - architecture = "x86_64" - } +variable "golden_cache_bucket" { + type = string + default = "" + description = "Optional S3 bucket to upload the bake's golden default.ext4 to (under bases/{golden-version}/). Cell-scoped — e.g. oc-aws-us-east-2-poc-golden-cache. Empty = skip upload." } +# --------------------------------------------------------------------- +# Source +# --------------------------------------------------------------------- + source "amazon-ebs" "worker" { - region = var.aws_region - source_ami = data.amazon-ami.ubuntu.id + region = var.region instance_type = var.instance_type ssh_username = "ubuntu" + ssh_pty = true - ami_name = "${var.ami_name_prefix}-${var.worker_version}-${formatdate("YYYYMMDD-hhmmss", timestamp())}" - ami_description = "OpenSandbox worker AMI — version ${var.worker_version}" + ami_name = "opensandbox-worker-${var.worker_version}-${formatdate("YYYYMMDD-hhmm", timestamp())}" + ami_description = "OpenSandbox worker AMI (Ubuntu 24.04, QEMU/KVM nested-virt). Built from git ${var.worker_version}."
- tags = { - Name = "${var.ami_name_prefix}-${var.worker_version}" - "opensandbox:version" = var.worker_version - "opensandbox:role" = "worker" - "opensandbox:guest_version" = var.guest_image_version + source_ami_filter { + filters = { + name = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*" + architecture = "x86_64" + virtualization-type = "hvm" + root-device-type = "ebs" + } + most_recent = true + owners = ["099720109477"] # Canonical } - # Bigger root volume so we can stage rootfs images and tools. + ena_support = true + sriov_support = true + launch_block_device_mappings { device_name = "/dev/sda1" - volume_size = 30 + volume_size = 50 volume_type = "gp3" delete_on_termination = true } + + # AMI tags — the terraform `aws_ami` data source in the AWS leaf filters + # on these to pick the most-recent worker AMI for this cloud. + tags = { + Name = "opensandbox-worker-${var.worker_version}" + "opensandbox-role" = "worker" + "opensandbox-cloud" = "aws" + "opensandbox-version" = var.worker_version + } + + # Volume snapshot tag — propagates so the EBS snapshot underlying the AMI + # has the same provenance metadata as the AMI itself. + snapshot_tags = { + "opensandbox-role" = "worker" + "opensandbox-cloud" = "aws" + "opensandbox-version" = var.worker_version + } + + run_tags = { + Name = "packer-opensandbox-worker-build" + } } +# --------------------------------------------------------------------- +# Build +# --------------------------------------------------------------------- + build { sources = ["source.amazon-ebs.worker"] - # Install system packages (KVM, QEMU, mdadm, xfs tools, etc.) 
- provisioner "shell" { - inline = [ - "set -euo pipefail", - "export DEBIAN_FRONTEND=noninteractive", - "sudo apt-get update -qq", - "sudo apt-get install -y -qq qemu-system-x86 qemu-utils mdadm xfsprogs e2fsprogs jq curl ca-certificates", - "sudo systemctl disable --now systemd-resolved 2>/dev/null || true", - "sudo mkdir -p /opt/opensandbox/images /usr/local/bin /etc/opensandbox", - ] - } - - # Upload worker + agent binaries + # 1. Upload pre-built binaries. provisioner "file" { source = var.worker_binary destination = "/tmp/opensandbox-worker" @@ -140,76 +160,116 @@ build { source = var.agent_binary destination = "/tmp/osb-agent" } + + # 2. Upload rootfs build context. + provisioner "file" { + source = var.rootfs_context + destination = "/tmp/rootfs-ctx.tar.gz" + } + + # 3. Upload the EC2 worker systemd unit (the Azure variant uses a different + # unit; the EC2 one was already drafted at deploy/ec2/opensandbox-worker.service). + provisioner "file" { + source = "deploy/ec2/opensandbox-worker.service" + destination = "/tmp/opensandbox-worker.service" + } + + # 4. Upload Vector config + populator. provisioner "file" { - source = var.kernel_path - destination = "/tmp/vmlinux" + source = "deploy/vector/" + destination = "/tmp/vector/" } + # 5. Run the (misleadingly-named-but-cloud-agnostic) setup script. Installs + # QEMU, kernel modules, Docker for rootfs build, Vector, systemd units. provisioner "shell" { - inline = [ - "sudo install -m 0755 /tmp/opensandbox-worker /usr/local/bin/opensandbox-worker", - "sudo install -m 0755 /tmp/osb-agent /usr/local/bin/osb-agent", - "sudo install -m 0644 /tmp/vmlinux /opt/opensandbox/vmlinux", - ] + execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'" + script = "deploy/azure/setup-azure-host.sh" } - # Pull the canonical guest rootfs from R2/S3 if configured. + # 6. 
AWS-specific: install awscli (used by populate-vector-env.sh and by + # the worker user-data's shared-disk attach), then install binaries and + # build the golden rootfs. provisioner "shell" { + execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'" inline = [ - "set -euo pipefail", - "if [ -n '${var.guest_image_url}' ] && [ -n '${var.guest_image_version}' ]; then", - " echo 'Pulling guest rootfs ${var.guest_image_version} from ${var.guest_image_url}...'", - " sudo mkdir -p /opt/opensandbox/images/bases/${var.guest_image_version}", - " sudo curl -fsSL -o /opt/opensandbox/images/bases/${var.guest_image_version}/default.ext4 '${var.guest_image_url}'", - " sudo cp /opt/opensandbox/images/bases/${var.guest_image_version}/default.ext4 /opt/opensandbox/images/default.ext4", + # awscli v2 — apt's `awscli` is v1 and missing some commands we use. + "apt-get update -qq", + "apt-get install -y -qq unzip", + "curl -fsSL 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o /tmp/awscliv2.zip", + "cd /tmp && unzip -q awscliv2.zip && ./aws/install --update", + "rm -rf /tmp/awscliv2.zip /tmp/aws", + "aws --version", + + # Install worker + agent binaries. + "mv /tmp/opensandbox-worker /usr/local/bin/opensandbox-worker", + "chmod +x /usr/local/bin/opensandbox-worker", + "mv /tmp/osb-agent /usr/local/bin/osb-agent", + "chmod +x /usr/local/bin/osb-agent", + + # Install systemd unit. + "mv /tmp/opensandbox-worker.service /etc/systemd/system/opensandbox-worker.service", + "systemctl daemon-reload", + "systemctl enable opensandbox-worker.service", + + # Build the golden rootfs (no caching for PoC — every bake builds from scratch). 
+ "mkdir -p /tmp/rootfs-ctx", + "cd /tmp/rootfs-ctx && tar xzf /tmp/rootfs-ctx.tar.gz", + "INPUT_HASH=$({ sha256sum /usr/local/bin/osb-agent; find /tmp/rootfs-ctx -type f | sort | xargs sha256sum; sha256sum /opt/opensandbox/guest-modules/*.ko* 2>/dev/null; } | sha256sum | awk '{print $1}')", + "echo \"Rootfs input hash: $INPUT_HASH\"", + "ROOTFS_UUID=$(echo \"$INPUT_HASH\" | head -c 32 | sed 's/\\(........\\)\\(....\\)\\(....\\)\\(....\\)\\(............\\)/\\1-\\2-\\3-\\4-\\5/')", + "export ROOTFS_UUID", + "mkdir -p /data/firecracker/images /opt/opensandbox/images", + "cd /tmp/rootfs-ctx && bash deploy/ec2/build-rootfs-docker.sh /usr/local/bin/osb-agent /data/firecracker/images default", + "cp /data/firecracker/images/default.ext4 /opt/opensandbox/images/default.ext4", + + # Inject guest kernel modules into rootfs. + "GUEST_MODDIR=/opt/opensandbox/guest-modules", + "if [ -d \"$GUEST_MODDIR\" ] && [ -f /opt/opensandbox/images/default.ext4 ]; then", + " MNTDIR=$(mktemp -d)", + " mount -o loop /opt/opensandbox/images/default.ext4 $MNTDIR", + " mkdir -p $MNTDIR/lib/modules/extra", + " cp $GUEST_MODDIR/*.ko* $MNTDIR/lib/modules/extra/ 2>/dev/null || true", + " umount $MNTDIR", + " rmdir $MNTDIR", "fi", + + # Stamp the golden version (hash of the final ext4) — workers read this + # at boot to decide whether to fetch a newer golden from S3. + "GOLDEN_VERSION=$(/usr/local/bin/opensandbox-worker golden-version /opt/opensandbox/images/default.ext4 2>/dev/null || sha256sum /opt/opensandbox/images/default.ext4 | awk '{print $1}')", + "echo \"$GOLDEN_VERSION\" > /opt/opensandbox/images/golden-version", + "echo \"Golden version: $GOLDEN_VERSION\"", ] } - # systemd unit (mirrors the one written by Azure setup-host.sh) + # 7. Optional: upload the golden to S3 so the cell's shared-disk seeder + # + future per-instance prefetch path can fetch it without rebuilding. 
provisioner "shell" { - inline = [ - "set -euo pipefail", - "sudo tee /etc/systemd/system/opensandbox-worker.service > /dev/null <<'UNIT'", - "[Unit]", - "Description=OpenSandbox Worker (QEMU backend)", - "After=network-online.target", - "Wants=network-online.target", - "", - "[Service]", - "Type=simple", - "ExecStartPre=/sbin/modprobe vhost_vsock", - "EnvironmentFile=/etc/opensandbox/worker.env", - "ExecStart=/usr/local/bin/opensandbox-worker", - "Restart=on-failure", - "RestartSec=5", - "LimitNOFILE=1000000", - "LimitNPROC=infinity", - "KillMode=process", - "TimeoutStopSec=300", - "", - "[Install]", - "WantedBy=multi-user.target", - "UNIT", - "sudo systemctl daemon-reload", - "sudo systemctl enable opensandbox-worker", + execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'" + environment_vars = [ + "GOLDEN_CACHE_BUCKET=${var.golden_cache_bucket}", + "AWS_DEFAULT_REGION=${var.region}", ] - } - - # Publish AMI ID to SSM if configured - post-processor "shell-local" { - only = ["amazon-ebs.worker"] inline = [ - "AMI_ID=$(jq -r '.builds[-1].artifact_id' manifest.json | cut -d: -f2)", - "if [ -n '${var.ssm_param_name}' ] && [ -n \"$AMI_ID\" ]; then", - " aws ssm put-parameter --region ${var.aws_region} --name '${var.ssm_param_name}' --type String --overwrite --value \"$AMI_ID\"", - " aws ssm put-parameter --region ${var.aws_region} --name \"$(dirname '${var.ssm_param_name}')/worker-ami-version\" --type String --overwrite --value '${var.worker_version}'", - " echo \"Published $AMI_ID to ${var.ssm_param_name}\"", + "set -e", + "if [ -z \"$GOLDEN_CACHE_BUCKET\" ]; then", + " echo 'No golden_cache_bucket set; skipping S3 upload (worker AMI still includes the baked golden)'", + " exit 0", "fi", + "GOLDEN_VERSION=$(cat /opt/opensandbox/images/golden-version)", + "S3_KEY=\"bases/$GOLDEN_VERSION/default.ext4\"", + "echo \"Uploading default.ext4 → s3://$GOLDEN_CACHE_BUCKET/$S3_KEY (~4GB, will take a moment)\"", + # Instance profile credentials — the 
bake runs on an EC2 instance and + # picks up its role via the metadata service. If the builder role + # doesn't have s3:PutObject on the cell's bucket, the upload fails + # gracefully and the AMI still works (just without S3-side hydration). + "aws s3 cp /opt/opensandbox/images/default.ext4 \"s3://$GOLDEN_CACHE_BUCKET/$S3_KEY\" || echo 'S3 upload failed — continuing (AMI golden is the only copy)'", ] } + # 8. Write a manifest so external tooling can pin to the resulting AMI ID. post-processor "manifest" { - output = "manifest.json" + output = "packer-manifest-aws.json" + strip_path = true } } diff --git a/deploy/vector/populate-vector-env.sh b/deploy/vector/populate-vector-env.sh index 96e0064c..26ac1672 100755 --- a/deploy/vector/populate-vector-env.sh +++ b/deploy/vector/populate-vector-env.sh @@ -114,42 +114,102 @@ fi [ -f /etc/opensandbox/worker.env ] && . /etc/opensandbox/worker.env # shellcheck disable=SC1091 [ -f /etc/opensandbox/server.env ] && . /etc/opensandbox/server.env -VAULT_NAME="${OPENSANDBOX_AZURE_KEY_VAULT_NAME:-}" -if [ -z "$VAULT_NAME" ]; then - log "OPENSANDBOX_AZURE_KEY_VAULT_NAME unset — host has no KV configured (e.g. dev VM without managed identity); skipping (Vector will use whatever vector.env is on disk)" - exit 0 -fi +# Cloud detection — explicit OPENSANDBOX_CLOUD wins, else probe IMDS endpoints. +detect_cloud() { + if [ -n "${OPENSANDBOX_CLOUD:-}" ]; then + echo "$OPENSANDBOX_CLOUD"; return + fi + # AWS IMDSv2 token PUT — succeeds only on AWS. + if curl -fsS -X PUT --max-time 2 \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 60" \ + http://169.254.169.254/latest/api/token >/dev/null 2>&1; then + echo "aws"; return + fi + # Azure IMDS — requires Metadata: true header and api-version. 
+ if curl -fsS -H 'Metadata: true' --max-time 2 \ + "http://169.254.169.254/metadata/instance?api-version=2018-02-01" \ + >/dev/null 2>&1; then + echo "azure"; return + fi + echo "none" +} +CLOUD=$(detect_cloud) +log "detected cloud: $CLOUD" -# IMDS → AAD token for Key Vault -IMDS_RESP=$(curl -sf -H 'Metadata: true' \ - "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https%3A%2F%2Fvault.azure.net" \ - || true) -AAD_TOKEN=$(echo "$IMDS_RESP" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("access_token",""))' 2>/dev/null) -if [ -z "$AAD_TOKEN" ]; then - log "failed to acquire IMDS token (managed identity not attached?); skipping" - exit 0 -fi +# kv_get is the cloud-agnostic secret-fetch front. The provider-specific +# setup happens once below (Azure: get an AAD token; AWS: nothing — aws CLI +# uses the EC2 instance profile via the SDK's default chain) and then each +# logical secret name is dereferenced the same way regardless of cloud. -# Helper: fetch one secret value or empty string. -kv_get() { - local name=$1 - local resp - resp=$(curl -sf -H "Authorization: Bearer $AAD_TOKEN" \ - "https://${VAULT_NAME}.vault.azure.net/secrets/${name}?api-version=7.4" \ +case "$CLOUD" in +azure) + VAULT_NAME="${OPENSANDBOX_AZURE_KEY_VAULT_NAME:-}" + if [ -z "$VAULT_NAME" ]; then + log "OPENSANDBOX_AZURE_KEY_VAULT_NAME unset — host has no KV configured (e.g. 
dev VM without managed identity); skipping (Vector will use whatever vector.env is on disk)" + exit 0 + fi + IMDS_RESP=$(curl -sf -H 'Metadata: true' \ + "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https%3A%2F%2Fvault.azure.net" \ || true) - echo "$resp" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("value",""))' 2>/dev/null -} + AAD_TOKEN=$(echo "$IMDS_RESP" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("access_token",""))' 2>/dev/null) + if [ -z "$AAD_TOKEN" ]; then + log "failed to acquire IMDS token (managed identity not attached?); skipping" + exit 0 + fi + kv_get() { + local name=$1 + local resp + resp=$(curl -sf -H "Authorization: Bearer $AAD_TOKEN" \ + "https://${VAULT_NAME}.vault.azure.net/secrets/${name}?api-version=7.4" \ + || true) + echo "$resp" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("value",""))' 2>/dev/null + } + ;; + +aws) + SECRETS_PREFIX="${OPENSANDBOX_AWS_SECRETS_PREFIX:-}" + if [ -z "$SECRETS_PREFIX" ]; then + log "OPENSANDBOX_AWS_SECRETS_PREFIX unset — host has no Secrets Manager prefix configured; skipping (Vector will use whatever vector.env is on disk)" + exit 0 + fi + if ! command -v aws >/dev/null 2>&1; then + log "aws CLI not installed in AMI — populator can't fetch from Secrets Manager. Bake awscli into the worker image (see deploy/packer/worker-ami-aws.pkr.hcl)." + exit 0 + fi + # Auto-detect region from IMDSv2 so we don't have to plumb it via env. 
+ AWS_IMDS_TOKEN=$(curl -fsS -X PUT --max-time 2 \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 60" \ + http://169.254.169.254/latest/api/token 2>/dev/null || true) + AWS_REGION_DETECTED=$(curl -fsS --max-time 2 \ + -H "X-aws-ec2-metadata-token: $AWS_IMDS_TOKEN" \ + http://169.254.169.254/latest/meta-data/placement/region 2>/dev/null || true) + export AWS_REGION="${OPENSANDBOX_REGION:-${AWS_REGION_DETECTED:-us-east-2}}" + + kv_get() { + local name=$1 + aws secretsmanager get-secret-value \ + --secret-id "${SECRETS_PREFIX}${name}" \ + --query SecretString \ + --output text 2>/dev/null || echo "" + } + ;; + +*) + log "no recognized cloud detected and OPENSANDBOX_CLOUD is empty — skipping. Vector will use whatever vector.env is on disk." + exit 0 + ;; +esac TOKEN_VALUE=$(kv_get "$TOKEN_SECRET") if [ -z "$TOKEN_VALUE" ]; then - log "secret $TOKEN_SECRET not found in $VAULT_NAME (or no access); skipping" + log "secret $TOKEN_SECRET not found in $CLOUD secret store (or no access); skipping" exit 0 fi DATASET_VALUE=$(kv_get "$DATASET_SECRET") if [ -z "$DATASET_VALUE" ]; then - log "secret $DATASET_SECRET not found in $VAULT_NAME (or no access); skipping — Vector won't have a dataset to ship to" + log "secret $DATASET_SECRET not found in $CLOUD secret store (or no access); skipping — Vector won't have a dataset to ship to" exit 0 fi @@ -169,7 +229,7 @@ fi CELL_ID_VALUE=$(kv_get "$CELL_ID_SECRET") if [ -z "$CELL_ID_VALUE" ]; then CELL_ID_VALUE="${OPENCOMPUTER_CELL_ID:-unknown}" - log "secret $CELL_ID_SECRET not found in $VAULT_NAME — falling back to OPENCOMPUTER_CELL_ID=$CELL_ID_VALUE" + log "secret $CELL_ID_SECRET not found in $CLOUD secret store — falling back to OPENCOMPUTER_CELL_ID=$CELL_ID_VALUE" fi # Auto-detect HOST_IP via the kernel's source-address selection (skips link-local). 
@@ -194,4 +254,4 @@ metrics_status="absent" if [ -n "$METRICS_TOKEN_VALUE" ] && [ -n "$METRICS_DATASET_VALUE" ]; then metrics_status="present" fi -log "populated $ENV_FILE (logs token+dataset from $VAULT_NAME, metrics=$metrics_status, cell_id=$CELL_ID_VALUE, host_ip=${HOST_IP:-unknown})" +log "populated $ENV_FILE (logs token+dataset from $CLOUD, metrics=$metrics_status, cell_id=$CELL_ID_VALUE, host_ip=${HOST_IP:-unknown})" diff --git a/deploy/worker.env.example b/deploy/worker.env.example index d4133699..6f3234e6 100644 --- a/deploy/worker.env.example +++ b/deploy/worker.env.example @@ -5,10 +5,32 @@ # Non-secret config values stay in this file. # ── Secrets Service ────────────────────────────────────────────── -# Set this to enable secrets loading from secrets service. -# When set, all values marked [KEY VAULT] below are fetched automatically. -# Remove those values from this file — they're managed in the vault. +# Set ONE of the following — whichever matches the cloud the worker runs in. +# When set, all values marked "managed in secrets service" below are fetched +# automatically and don't need to be set here. +# +# Azure Key Vault — vault name (e.g. opencomputer-prod-kv). SECRETS_VAULT_NAME=opencomputer-prod-kv +# OPENSANDBOX_AZURE_KEY_VAULT_NAME=opencomputer-prod-kv # legacy alias for SECRETS_VAULT_NAME +# +# AWS Secrets Manager — prefix under which the cell's secrets live, e.g. +# "opencomputer/aws-us-east-2-poc/". The provider lists secrets under this +# prefix and dereferences each name in secretMapping (internal/config/secrets.go). +# OPENSANDBOX_AWS_SECRETS_PREFIX=opencomputer/aws-us-east-2-poc/ + +# Cloud identifier — set by cloud-init from terraform on AWS workers. Used by +# deploy/vector/populate-vector-env.sh and the Go preemption monitor to pick +# the right backend. Leave unset for local dev (script auto-detects via IMDS). 
+# OPENSANDBOX_CLOUD=aws + +# CPU overcommit ratio — multiplies the physical-vCPU-derived MAX_CAPACITY +# before the worker advertises capacity to the CP via Redis heartbeat. Memory +# is never overcommitted because host OOM is a worse failure mode than CPU +# contention. Recommended values: +# 1 no overcommit (Azure default, predictable performance) +# 2 ~36% cheaper per sandbox on AWS r8i for agent workloads (typical default) +# 3 aggressive — only if sandboxes are dominantly idle (LLM-bound) +# OPENSANDBOX_CPU_OVERCOMMIT_RATIO=1 # ── Worker Config (not secret — stays in this file) ────────────── OPENSANDBOX_MODE=worker diff --git a/internal/config/config.go b/internal/config/config.go index 8a314487..b78ba625 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -171,6 +171,25 @@ type Config struct { // Env vars take precedence over secret values (for local overrides). SecretsARN string + // AWSSecretsPrefix is the per-cell secret name prefix in AWS Secrets + // Manager, e.g. "opencomputer/aws-us-east-2-poc/". When set, LoadSecrets + // selects the AWS Secrets Manager provider and lists+loads everything + // under this prefix. Mirrors AzureKeyVaultName for the AWS path. + AWSSecretsPrefix string + + // Cloud is a short identifier for the cloud the binary is running on + // ("aws", "azure", or empty for local/dev). Set by cloud-init from + // terraform — preferred over runtime probing for fast startup. + Cloud string + + // CPUOvercommitRatio multiplies MaxCapacity once, in Load (values below 1 + // are treated as 1), so the Redis heartbeat publishes the inflated + // capacity. 1 = no overcommit (matches Azure default). 2 = recommended + // for agent workloads on AWS — roughly halves $/sandbox without + // measurable UX impact for LLM-bound sandboxes. Memory is never + // overcommitted because host OOM is a worse failure mode than CPU contention.
+ CPUOvercommitRatio int + // Secret encryption key (hex-encoded 32 bytes / 64 hex chars) for encrypting // project secrets at rest in PostgreSQL. Required if using project secrets. SecretEncryptionKey string @@ -342,6 +361,9 @@ func Load() (*Config, error) { AzureSubnetID: os.Getenv("OPENSANDBOX_AZURE_SUBNET_ID"), AzureSSHPublicKey: os.Getenv("OPENSANDBOX_AZURE_SSH_PUBLIC_KEY"), AzureKeyVaultName: os.Getenv("OPENSANDBOX_AZURE_KEY_VAULT_NAME"), + AWSSecretsPrefix: os.Getenv("OPENSANDBOX_AWS_SECRETS_PREFIX"), + Cloud: os.Getenv("OPENSANDBOX_CLOUD"), + CPUOvercommitRatio: envOrDefaultInt("OPENSANDBOX_CPU_OVERCOMMIT_RATIO", 1), AzureWorkerIdentityID: os.Getenv("OPENSANDBOX_AZURE_WORKER_IDENTITY_ID"), CFAPIToken: os.Getenv("OPENSANDBOX_CF_API_TOKEN"), @@ -418,6 +440,20 @@ func Load() (*Config, error) { cfg.CellID = cfg.Region + "-default" } + // CPU overcommit: multiply the physical-capacity ceiling by the + // configured ratio so the worker advertises (and the CP places onto) + // the inflated slot count. Ratio < 1 is treated as 1 (the operator + // cannot under-commit via this knob — set MaxCapacity directly for + // that). Applied after all loading so callers see one consistent + // number; the heartbeat, gRPC capacity guard, and any future consumer + // all agree. + if cfg.CPUOvercommitRatio < 1 { + cfg.CPUOvercommitRatio = 1 + } + if cfg.CPUOvercommitRatio > 1 && cfg.MaxCapacity > 0 { + cfg.MaxCapacity = cfg.MaxCapacity * cfg.CPUOvercommitRatio + } + if portStr := os.Getenv("OPENSANDBOX_PORT"); portStr != "" { port, err := strconv.Atoi(portStr) if err != nil { diff --git a/internal/config/keyvault.go b/internal/config/keyvault.go index 2d4b7aab..09dd5d11 100644 --- a/internal/config/keyvault.go +++ b/internal/config/keyvault.go @@ -1,188 +1,84 @@ -// Package config provides configuration loading from Azure Key Vault. -// -// If SECRETS_VAULT_NAME is set, LoadSecretsFromKeyVault fetches all secrets -// from the vault and maps them to environment variables. 
The mapping is: -// -// Key Vault secret name → Environment variable -// server-database-url → OPENSANDBOX_DATABASE_URL -// server-jwt-secret → OPENSANDBOX_JWT_SECRET -// worker-s3-secret-key → OPENSANDBOX_S3_SECRET_ACCESS_KEY -// ...etc -// -// Secrets already set in the environment are NOT overwritten — env vars take -// precedence over Key Vault. This allows local overrides for development. +// Package config — Azure Key Vault implementation of SecretsProvider. // // Authentication uses Azure Default Credential (Managed Identity on VMs, -// CLI credentials locally). No explicit credentials needed. +// CLI credentials locally). The trigger env var is OPENSANDBOX_AZURE_KEY_VAULT_NAME +// (legacy: SECRETS_VAULT_NAME); LoadSecrets() in secrets.go selects this +// provider when either is set. + package config import ( "context" + "fmt" + "log" "os" - "time" - "github.com/opensandbox/opensandbox/internal/secrets" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azsecrets" ) -// kvMapping maps Key Vault entry names to environment variable names. Despite -// the historical "secret" terminology this includes both real secrets (DB -// passwords, JWT keys) and non-secret cell config (region, cell_id, capacity -// tuning). The principle: only bootstrap pointers + per-VM identity stay in -// the env file; everything else is in KV so a cell's configuration is one -// source-of-truth rather than scattered across N worker.env files. -// -// Entries not in this map are silently ignored — the allowlist is the -// safety guard preventing a stray vault entry from accidentally shadowing -// an unrelated env var (e.g., PATH). 
-var kvMapping = map[string]string{ - // Server secrets - "server-database-url": "OPENSANDBOX_DATABASE_URL", - "server-redis-url": "OPENSANDBOX_REDIS_URL", - "server-jwt-secret": "OPENSANDBOX_JWT_SECRET", - "server-api-key": "OPENSANDBOX_API_KEY", - "server-secret-encryption-key": "OPENSANDBOX_SECRET_ENCRYPTION_KEY", - "server-workos-api-key": "WORKOS_API_KEY", - "server-workos-client-id": "WORKOS_CLIENT_ID", - "server-cf-api-token": "OPENSANDBOX_CF_API_TOKEN", - "server-cf-zone-id": "OPENSANDBOX_CF_ZONE_ID", - "server-stripe-secret-key": "STRIPE_SECRET_KEY", - "server-stripe-webhook-secret": "STRIPE_WEBHOOK_SECRET", - "server-sentry-dsn": "OPENSANDBOX_SENTRY_DSN", - // Machine-size fallback lists (PR #209). Comma-separated ranked - // instance types the autoscaler tries in order on quota / capacity - // errors. Empty value = use the single VMSize / InstanceType - // configured on the pool (pre-fallback behavior). - "server-azure-vm-sizes": "OPENSANDBOX_AZURE_VM_SIZES", - "server-ec2-instance-types": "OPENSANDBOX_EC2_INSTANCE_TYPES", - // Legacy Axiom mappings — kept for backwards compat with existing prod - // KVs that pre-date the `shared-` prefix. New deploys should use - // `shared-axiom-*` instead. Safe to leave: in server mode only - // `server-axiom-*` is loaded; in worker mode only `worker-axiom-*`. New - // `shared-*` mappings below win for new envs that have only those. - "server-axiom-query-token": "AXIOM_QUERY_TOKEN", - "server-axiom-dataset": "AXIOM_DATASET", +// azureKeyVaultProvider fetches secrets by listing the vault and dereferencing +// every name that matches secretMapping for the current mode. +type azureKeyVaultProvider struct { + vaultName string +} - // Server-side cell config + shared secrets. These mirror the worker-* keys - // of the same name — both sides need them, and the prefix filter loads only - // secrets for the current mode. 
When dev1/prod consolidate to cell-* we can - // drop the duplicates; for now this keeps the layout symmetric and explicit. - "server-cell-id": "OPENSANDBOX_CELL_ID", - "server-region": "OPENSANDBOX_REGION", - "server-sandbox-domain": "OPENSANDBOX_SANDBOX_DOMAIN", - "server-cf-event-endpoint": "OPENSANDBOX_CF_EVENT_ENDPOINT", - "server-cf-event-secret": "OPENSANDBOX_CF_EVENT_SECRET", - "server-cf-admin-secret": "OPENSANDBOX_CF_ADMIN_SECRET", - "server-session-jwt-secret": "OPENSANDBOX_SESSION_JWT_SECRET", - "server-halt-list-url": "OPENSANDBOX_HALT_LIST_URL", +func (p *azureKeyVaultProvider) Name() string { return "azure-keyvault" } - // Worker secrets - "worker-jwt-secret": "OPENSANDBOX_JWT_SECRET", - "worker-database-url": "OPENSANDBOX_DATABASE_URL", - "worker-redis-url": "OPENSANDBOX_REDIS_URL", - "worker-s3-access-key": "OPENSANDBOX_S3_ACCESS_KEY_ID", - "worker-s3-secret-key": "OPENSANDBOX_S3_SECRET_ACCESS_KEY", - "worker-sentry-dsn": "OPENSANDBOX_SENTRY_DSN", - "worker-axiom-ingest-token": "AXIOM_INGEST_TOKEN", // legacy; superseded by shared-axiom-ingest-token - "worker-axiom-dataset": "AXIOM_DATASET", // legacy; superseded by shared-axiom-dataset +func (p *azureKeyVaultProvider) Load(ctx context.Context, mode string) (int, int, error) { + vaultURL := fmt.Sprintf("https://%s.vault.azure.net/", p.vaultName) - // Worker per-cell config (non-secret but cell-scoped — every worker in the - // cell shares these, so KV is the single source of truth) - "worker-region": "OPENSANDBOX_REGION", - "worker-cell-id": "OPENSANDBOX_CELL_ID", - "worker-max-capacity": "OPENSANDBOX_MAX_CAPACITY", - "worker-default-sandbox-memory-mb": "OPENSANDBOX_DEFAULT_SANDBOX_MEMORY_MB", - "worker-default-sandbox-cpus": "OPENSANDBOX_DEFAULT_SANDBOX_CPUS", - "worker-default-sandbox-disk-mb": "OPENSANDBOX_DEFAULT_SANDBOX_DISK_MB", - "worker-sandbox-domain": "OPENSANDBOX_SANDBOX_DOMAIN", - "worker-s3-bucket": "OPENSANDBOX_S3_BUCKET", - "worker-s3-region": "OPENSANDBOX_S3_REGION", - 
"worker-s3-endpoint": "OPENSANDBOX_S3_ENDPOINT", - "worker-s3-force-path-style": "OPENSANDBOX_S3_FORCE_PATH_STYLE", - "worker-cf-event-endpoint": "OPENSANDBOX_CF_EVENT_ENDPOINT", - "worker-halt-list-url": "OPENSANDBOX_HALT_LIST_URL", - "worker-segment-write-key": "SEGMENT_WRITE_KEY", + cred, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return 0, 0, fmt.Errorf("keyvault: azure credential: %w", err) + } - // CF-cutover event pipe (worker) - "worker-cf-event-secret": "OPENSANDBOX_CF_EVENT_SECRET", - "worker-cf-admin-secret": "OPENSANDBOX_CF_ADMIN_SECRET", - "worker-session-jwt-secret": "OPENSANDBOX_SESSION_JWT_SECRET", + client, err := azsecrets.NewClient(vaultURL, cred, nil) + if err != nil { + return 0, 0, fmt.Errorf("keyvault: client: %w", err) + } - // Phase 2 — global blob store (Tigris primary) - "worker-global-blob-name": "OPENSANDBOX_GLOBAL_BLOB_NAME", - "worker-global-blob-endpoint": "OPENSANDBOX_GLOBAL_BLOB_ENDPOINT", - "worker-global-blob-region": "OPENSANDBOX_GLOBAL_BLOB_REGION", - "worker-global-blob-access-key-id": "OPENSANDBOX_GLOBAL_BLOB_ACCESS_KEY_ID", - "worker-global-blob-secret-access-key": "OPENSANDBOX_GLOBAL_BLOB_SECRET_ACCESS_KEY", - "worker-global-blob-goldens-bucket": "OPENSANDBOX_GLOBAL_BLOB_GOLDENS_BUCKET", - "worker-global-blob-templates-bucket": "OPENSANDBOX_GLOBAL_BLOB_TEMPLATES_BUCKET", - "worker-global-blob-events-bucket": "OPENSANDBOX_GLOBAL_BLOB_EVENTS_BUCKET", + loaded, skipped := 0, 0 - // Phase 2 — global blob store (optional fallback) - "worker-global-blob-fallback-name": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_NAME", - "worker-global-blob-fallback-endpoint": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_ENDPOINT", - "worker-global-blob-fallback-region": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_REGION", - "worker-global-blob-fallback-access-key-id": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_ACCESS_KEY_ID", - "worker-global-blob-fallback-secret-access-key": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_SECRET_ACCESS_KEY", + pager := 
client.NewListSecretPropertiesPager(nil) + for pager.More() { + page, err := pager.NextPage(ctx) + if err != nil { + return loaded, skipped, fmt.Errorf("keyvault: list secrets: %w", err) + } - // Shared (mode-agnostic — loaded in both server and worker) - "pg-password": "OPENSANDBOX_PG_PASSWORD", - "shared-axiom-ingest-token": "AXIOM_INGEST_TOKEN", - "shared-axiom-query-token": "AXIOM_QUERY_TOKEN", - "shared-axiom-dataset": "AXIOM_DATASET", - // Platform-logs: Vector reads these from /etc/opensandbox/vector.env, - // populated by populate-vector-env.service via its own IMDS+KV REST call - // (not by this Go-side loader, because Vector starts as its own systemd - // unit before the Go binary). The entries here exist for two reasons: - // 1. Discoverability — kvMapping is the single source of truth for - // "what shared-* secrets does this deployment need in KV". - // 2. Side-effect: the Go binary ALSO loads them into its own env at - // startup; future Go code that wants to surface platform-stream - // config (e.g. an admin endpoint) gets them for free. - "shared-axiom-platform-ingest-token": "AXIOM_PLATFORM_TOKEN", - "shared-axiom-platform-dataset": "AXIOM_PLATFORM_DATASET", - // Cell identifier — stamped on every log + metric event so platform - // dashboards can filter per cell. Same dual-consumer pattern as the - // platform-* secrets above: Vector reads it from /etc/opensandbox/vector.env - // (written by populate-vector-env.sh) for its remap substitutions, and - // the Go binary reads it from cfg.CellID (which falls back to - // "-default" when this isn't in KV — see config.go). - "shared-cell-id": "OPENSANDBOX_CELL_ID", + for _, prop := range page.Value { + name := prop.ID.Name() + envVar, mapped := secretMapping[name] + if !mapped { + continue + } + if !shouldLoadForMode(name, mode) { + continue + } + // Skip the network round-trip when the env is already set — + // callers can override locally without paying for the GET. 
+ if os.Getenv(envVar) != "" { + skipped++ + continue + } - // Edge integration (mode-agnostic — both server and worker need them). - // cf-edge-base-url is consumed by the CP's edgeclient for HMAC'd - // /internal/templates + /internal/secret-stores lookups; secret-encryption-key - // is the AES-256-GCM key shared with the api-edge Worker so the edge can - // encrypt secret-store entries and any cell can decrypt them. - "shared-cf-edge-base-url": "OPENSANDBOX_CF_EDGE_BASE_URL", - "shared-secret-encryption-key": "OPENSANDBOX_SECRET_ENCRYPTION_KEY", -} + resp, err := client.GetSecret(ctx, name, "", nil) + if err != nil { + log.Printf("keyvault: failed to get secret %s: %v (skipping)", name, err) + continue + } + if resp.Value == nil { + continue + } -// LoadSecretsFromKeyVault fetches secrets from Azure Key Vault and sets them -// as environment variables. Only loads secrets relevant to the current mode -// (server or worker), determined by the secret name prefix. -// -// Skips secrets that are already set in the environment — local env wins -// (the .env file is the "emergency override" path). -// -// Does nothing if SECRETS_VAULT_NAME is not set. Now delegates to the -// internal/secrets KeyVaultBackend so server, worker, and the AzurePool -// runtime image-refresh path share one implementation. 
-func LoadSecretsFromKeyVault() error { - vaultName := os.Getenv("SECRETS_VAULT_NAME") - if vaultName == "" { - return nil // Key Vault not configured — use env file as-is + if setIfUnset(envVar, *resp.Value) { + loaded++ + } else { + skipped++ + } + } } - mode := os.Getenv("OPENSANDBOX_MODE") // "server" or "worker" - be, err := secrets.NewKeyVaultBackend(vaultName, kvMapping, mode) - if err != nil { - return err - } - if be == nil { - return nil // shouldn't happen — vaultName non-empty was checked above - } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - _, _, err = be.LoadAllToEnv(ctx) - return err + return loaded, skipped, nil } diff --git a/internal/config/secrets.go b/internal/config/secrets.go new file mode 100644 index 00000000..64943160 --- /dev/null +++ b/internal/config/secrets.go @@ -0,0 +1,213 @@ +// Package config — multi-cloud secret loading. +// +// A SecretsProvider knows how to fetch secrets from a cloud secret store and +// populate them as environment variables, honoring the same conventions: +// +// - secretMapping is the single source of truth for "what logical secret +// name maps to what env var". Both Azure Key Vault and AWS Secrets +// Manager providers use the same map; the difference is the transport. +// - Mode-prefix filtering: only `<mode>-*`, `pg-*`, and `shared-*` secrets +// are loaded, where mode is the value of OPENSANDBOX_MODE ("server" or +// "worker"). Without this a server would try to load worker-only +// secrets and vice versa. +// - Env-var precedence: secrets already set in os.Environ() are NOT +// overwritten. Lets local dev shells override. +// +// LoadSecrets() is the entrypoint — it picks a provider by environment: +// +// OPENSANDBOX_AZURE_KEY_VAULT_NAME or SECRETS_VAULT_NAME → Azure +// OPENSANDBOX_AWS_SECRETS_PREFIX → AWS Secrets Manager +// (neither set) → no-op +// +// Both env vars set is treated as "both providers run in order" — useful for +// migrations.
+ +package config + +import ( + "context" + "log" + "os" + "strings" + "time" +) + +// SecretsProvider is implemented by per-cloud secret loaders. Implementations +// must honor mode-prefix filtering and env-var precedence; see Load doc. +type SecretsProvider interface { + // Name returns a short identifier used only for log messages. + Name() string + + // Load fetches secrets and sets matching env vars. Returns (loaded, + // skipped, err). `mode` is the value of OPENSANDBOX_MODE; pass "" + // for mode-agnostic loaders. + Load(ctx context.Context, mode string) (loaded int, skipped int, err error) +} + +// secretMapping is the canonical map of "logical secret name → env var +// name". Implementations look up each fetched secret's logical name here +// to decide what env var to set. +// +// Logical secret names are prefixed: +// +// server-* loaded only when OPENSANDBOX_MODE=server +// worker-* loaded only when OPENSANDBOX_MODE=worker +// pg-* grandfathered shared (Postgres password) — loaded in both modes +// shared-* mode-agnostic; loaded in both modes +// +// Backends apply different transports for fetching these by name (KV list- +// then-get, SM list-by-prefix-then-get). The names themselves are the same +// across clouds — operators see the same logical inventory regardless of +// where the cell runs. 
+var secretMapping = map[string]string{ + // ----- Server secrets ----- + "server-database-url": "OPENSANDBOX_DATABASE_URL", + "server-redis-url": "OPENSANDBOX_REDIS_URL", + "server-jwt-secret": "OPENSANDBOX_JWT_SECRET", + "server-api-key": "OPENSANDBOX_API_KEY", + "server-secret-encryption-key": "OPENSANDBOX_SECRET_ENCRYPTION_KEY", + "server-workos-api-key": "WORKOS_API_KEY", + "server-workos-client-id": "WORKOS_CLIENT_ID", + "server-cf-api-token": "OPENSANDBOX_CF_API_TOKEN", + "server-cf-zone-id": "OPENSANDBOX_CF_ZONE_ID", + "server-stripe-secret-key": "STRIPE_SECRET_KEY", + "server-stripe-webhook-secret": "STRIPE_WEBHOOK_SECRET", + "server-sentry-dsn": "OPENSANDBOX_SENTRY_DSN", + "server-azure-vm-sizes": "OPENSANDBOX_AZURE_VM_SIZES", + "server-ec2-instance-types": "OPENSANDBOX_EC2_INSTANCE_TYPES", + + // Legacy Axiom mappings — kept for backwards compat with existing prod + // KVs that pre-date the `shared-` prefix. New deploys should use + // `shared-axiom-*` instead. + "server-axiom-query-token": "AXIOM_QUERY_TOKEN", + "server-axiom-dataset": "AXIOM_DATASET", + + // Server-side cell config + shared secrets. 
+ "server-cell-id": "OPENSANDBOX_CELL_ID", + "server-region": "OPENSANDBOX_REGION", + "server-sandbox-domain": "OPENSANDBOX_SANDBOX_DOMAIN", + "server-cf-event-endpoint": "OPENSANDBOX_CF_EVENT_ENDPOINT", + "server-cf-event-secret": "OPENSANDBOX_CF_EVENT_SECRET", + "server-cf-admin-secret": "OPENSANDBOX_CF_ADMIN_SECRET", + "server-session-jwt-secret": "OPENSANDBOX_SESSION_JWT_SECRET", + "server-halt-list-url": "OPENSANDBOX_HALT_LIST_URL", + + // ----- Worker secrets ----- + "worker-jwt-secret": "OPENSANDBOX_JWT_SECRET", + "worker-database-url": "OPENSANDBOX_DATABASE_URL", + "worker-redis-url": "OPENSANDBOX_REDIS_URL", + "worker-s3-access-key": "OPENSANDBOX_S3_ACCESS_KEY_ID", + "worker-s3-secret-key": "OPENSANDBOX_S3_SECRET_ACCESS_KEY", + "worker-sentry-dsn": "OPENSANDBOX_SENTRY_DSN", + "worker-axiom-ingest-token": "AXIOM_INGEST_TOKEN", + "worker-axiom-dataset": "AXIOM_DATASET", + + // Worker per-cell config. + "worker-region": "OPENSANDBOX_REGION", + "worker-cell-id": "OPENSANDBOX_CELL_ID", + "worker-max-capacity": "OPENSANDBOX_MAX_CAPACITY", + "worker-default-sandbox-memory-mb": "OPENSANDBOX_DEFAULT_SANDBOX_MEMORY_MB", + "worker-default-sandbox-cpus": "OPENSANDBOX_DEFAULT_SANDBOX_CPUS", + "worker-default-sandbox-disk-mb": "OPENSANDBOX_DEFAULT_SANDBOX_DISK_MB", + "worker-sandbox-domain": "OPENSANDBOX_SANDBOX_DOMAIN", + "worker-s3-bucket": "OPENSANDBOX_S3_BUCKET", + "worker-s3-region": "OPENSANDBOX_S3_REGION", + "worker-s3-endpoint": "OPENSANDBOX_S3_ENDPOINT", + "worker-s3-force-path-style": "OPENSANDBOX_S3_FORCE_PATH_STYLE", + "worker-cf-event-endpoint": "OPENSANDBOX_CF_EVENT_ENDPOINT", + "worker-halt-list-url": "OPENSANDBOX_HALT_LIST_URL", + "worker-segment-write-key": "SEGMENT_WRITE_KEY", + "worker-cf-event-secret": "OPENSANDBOX_CF_EVENT_SECRET", + "worker-cf-admin-secret": "OPENSANDBOX_CF_ADMIN_SECRET", + "worker-session-jwt-secret": "OPENSANDBOX_SESSION_JWT_SECRET", + "worker-global-blob-name": "OPENSANDBOX_GLOBAL_BLOB_NAME", + "worker-global-blob-endpoint": 
"OPENSANDBOX_GLOBAL_BLOB_ENDPOINT", + "worker-global-blob-region": "OPENSANDBOX_GLOBAL_BLOB_REGION", + "worker-global-blob-access-key-id": "OPENSANDBOX_GLOBAL_BLOB_ACCESS_KEY_ID", + "worker-global-blob-secret-access-key": "OPENSANDBOX_GLOBAL_BLOB_SECRET_ACCESS_KEY", + "worker-global-blob-goldens-bucket": "OPENSANDBOX_GLOBAL_BLOB_GOLDENS_BUCKET", + "worker-global-blob-templates-bucket": "OPENSANDBOX_GLOBAL_BLOB_TEMPLATES_BUCKET", + "worker-global-blob-events-bucket": "OPENSANDBOX_GLOBAL_BLOB_EVENTS_BUCKET", + "worker-global-blob-fallback-name": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_NAME", + "worker-global-blob-fallback-endpoint": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_ENDPOINT", + "worker-global-blob-fallback-region": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_REGION", + "worker-global-blob-fallback-access-key-id": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_ACCESS_KEY_ID", + "worker-global-blob-fallback-secret-access-key": "OPENSANDBOX_GLOBAL_BLOB_FALLBACK_SECRET_ACCESS_KEY", + + // ----- Shared / mode-agnostic ----- + "pg-password": "OPENSANDBOX_PG_PASSWORD", + "shared-axiom-ingest-token": "AXIOM_INGEST_TOKEN", + "shared-axiom-query-token": "AXIOM_QUERY_TOKEN", + "shared-axiom-dataset": "AXIOM_DATASET", + "shared-axiom-platform-ingest-token": "AXIOM_PLATFORM_TOKEN", + "shared-axiom-platform-dataset": "AXIOM_PLATFORM_DATASET", + "shared-cell-id": "OPENSANDBOX_CELL_ID", + "shared-cf-edge-base-url": "OPENSANDBOX_CF_EDGE_BASE_URL", + "shared-secret-encryption-key": "OPENSANDBOX_SECRET_ENCRYPTION_KEY", +} + +// shouldLoadForMode returns true if the given logical secret name applies to +// the running mode. Empty mode loads everything; otherwise only matching +// prefixes plus `pg-*` and `shared-*` pass. 
+func shouldLoadForMode(name, mode string) bool { + if mode == "" { + return true + } + return strings.HasPrefix(name, mode+"-") || + strings.HasPrefix(name, "pg-") || + strings.HasPrefix(name, "shared-") +} + +// setIfUnset writes name=value to the process environment, but only when +// name is not already set. Returns true if the value was applied. +func setIfUnset(name, value string) bool { + if os.Getenv(name) != "" { + return false + } + _ = os.Setenv(name, value) + return true +} + +// LoadSecrets picks a provider by environment and loads secrets from it. +// Mode is read from OPENSANDBOX_MODE. Returns nil and does nothing if no +// provider is configured — local dev with a worker.env file works as-is. +// +// If both Azure and AWS providers are configured (unusual but legal during +// a cross-cloud migration), both run in declaration order. +func LoadSecrets() error { + mode := os.Getenv("OPENSANDBOX_MODE") + + var providers []SecretsProvider + if name := azureVaultName(); name != "" { + providers = append(providers, &azureKeyVaultProvider{vaultName: name}) + } + if prefix := os.Getenv("OPENSANDBOX_AWS_SECRETS_PREFIX"); prefix != "" { + providers = append(providers, &awsSecretsManagerProvider{prefix: prefix}) + } + + if len(providers) == 0 { + return nil + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + for _, p := range providers { + loaded, skipped, err := p.Load(ctx, mode) + if err != nil { + return err + } + log.Printf("secrets: %s loaded %d, skipped %d (already set)", p.Name(), loaded, skipped) + } + return nil +} + +// azureVaultName reads the Azure KV name from either of the two env vars +// historically used. Kept here so both `secrets.go` and `keyvault.go` see +// the same lookup precedence. 
+func azureVaultName() string { + if v := os.Getenv("OPENSANDBOX_AZURE_KEY_VAULT_NAME"); v != "" { + return v + } + return os.Getenv("SECRETS_VAULT_NAME") +} diff --git a/internal/config/secretsmanager.go b/internal/config/secretsmanager.go index 69ef2d53..a91b512c 100644 --- a/internal/config/secretsmanager.go +++ b/internal/config/secretsmanager.go @@ -1,58 +1,114 @@ +// Package config — AWS Secrets Manager implementation of SecretsProvider. +// +// Authentication uses the AWS Default Credential chain (EC2 IAM role on +// instances, AWS_PROFILE / SSO / env locally). The trigger env var is +// OPENSANDBOX_AWS_SECRETS_PREFIX; LoadSecrets() in secrets.go selects this +// provider when it is set. +// +// Layout: secrets are stored under a flat per-cell prefix, e.g. +// +// opencomputer/aws-us-east-2-poc/worker-jwt-secret +// opencomputer/aws-us-east-2-poc/worker-redis-url +// opencomputer/aws-us-east-2-poc/shared-axiom-ingest-token +// +// The provider lists everything under the prefix, strips it, looks up the +// remaining logical name in secretMapping (cloud-agnostic, defined in +// secrets.go), and sets the matching env var if not already populated. + package config import ( "context" + "fmt" + "log" "os" - "time" + "strings" - "github.com/opensandbox/opensandbox/internal/secrets" + "github.com/aws/aws-sdk-go-v2/aws" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/secretsmanager" + "github.com/aws/aws-sdk-go-v2/service/secretsmanager/types" ) -// LoadSecretsFromSecretsManager is the AWS analogue of LoadSecretsFromKeyVault. -// Maps every kebab-case key from kvMapping into the process environment via -// the same translation Azure KV uses (server-jwt-secret → OPENSANDBOX_JWT_SECRET). -// -// Two trigger env vars, picked one at a time: -// -// - OPENSANDBOX_SECRETS_ARN: single bundled SM secret with JSON SecretString. -// One GetSecretValue, one IAM scope. 
Cheap ($0.40/mo) but requires the -// producer (Infisical or otherwise) to write a JSON bundle. -// -// - OPENSANDBOX_SECRETS_AWS_REGION: per-key mode. Each kvMapping key is its -// own SM secret. BatchGetSecretValue fetches them in chunks of 20. This -// is what Infisical's "Sync each secret to its own secret" mode produces. -// Costs $0.40/mo per secret but matches Infisical's default sync shape. -// -// Existing env vars take precedence (emergency override path). -// -// Does nothing if neither trigger is set. -func LoadSecretsFromSecretsManager() error { - bundleARN := os.Getenv("OPENSANDBOX_SECRETS_ARN") - listRegion := os.Getenv("OPENSANDBOX_SECRETS_AWS_REGION") - if bundleARN == "" && listRegion == "" { - return nil // not configured; defer to env file / Azure KV / combined mode - } - mode := os.Getenv("OPENSANDBOX_MODE") +// awsSecretsManagerProvider fetches secrets by listing the cell's prefix and +// dereferencing every name that matches secretMapping for the current mode. +type awsSecretsManagerProvider struct { + prefix string +} - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() +func (p *awsSecretsManagerProvider) Name() string { return "aws-secretsmanager" } - be, err := secrets.NewSecretsManagerBackend(ctx, listRegion, bundleARN) +func (p *awsSecretsManagerProvider) Load(ctx context.Context, mode string) (int, int, error) { + cfg, err := awsconfig.LoadDefaultConfig(ctx) if err != nil { - return err + return 0, 0, fmt.Errorf("secretsmanager: load aws config: %w", err) } - be.NameMap = kvMapping - be.ModePrefixFilter = mode + client := secretsmanager.NewFromConfig(cfg) - if bundleARN != "" { - _, _, err = be.LoadAllToEnv(ctx) - return err - } - // list/per-key mode — fetch each kvMapping key as its own SM secret - names := make([]string, 0, len(kvMapping)) - for k := range kvMapping { - names = append(names, k) + loaded, skipped := 0, 0 + + var nextToken *string + for { + out, err := client.ListSecrets(ctx, 
&secretsmanager.ListSecretsInput{ + MaxResults: aws.Int32(100), + NextToken: nextToken, + Filters: []types.Filter{ + { + Key: types.FilterNameStringTypeName, + Values: []string{p.prefix}, + }, + }, + }) + if err != nil { + return loaded, skipped, fmt.Errorf("secretsmanager: list: %w", err) + } + + for _, entry := range out.SecretList { + if entry.Name == nil { + continue + } + fullName := *entry.Name + // `name` filter is a prefix-match; defensively strip and skip if not ours. + if !strings.HasPrefix(fullName, p.prefix) { + continue + } + logicalName := strings.TrimPrefix(fullName, p.prefix) + + envVar, mapped := secretMapping[logicalName] + if !mapped { + continue + } + if !shouldLoadForMode(logicalName, mode) { + continue + } + if os.Getenv(envVar) != "" { + skipped++ + continue + } + + val, err := client.GetSecretValue(ctx, &secretsmanager.GetSecretValueInput{ + SecretId: aws.String(fullName), + }) + if err != nil { + log.Printf("secretsmanager: failed to get secret %s: %v (skipping)", logicalName, err) + continue + } + if val.SecretString == nil { + continue + } + + if setIfUnset(envVar, *val.SecretString) { + loaded++ + } else { + skipped++ + } + } + + if out.NextToken == nil { + break + } + nextToken = out.NextToken } - _, _, err = be.LoadAllByNameList(ctx, names) - return err + + return loaded, skipped, nil } diff --git a/internal/preemption/aws.go b/internal/preemption/aws.go new file mode 100644 index 00000000..965711d3 --- /dev/null +++ b/internal/preemption/aws.go @@ -0,0 +1,172 @@ +// AWS spot interruption monitor. +// +// AWS publishes interruption notices on IMDSv2 at +// GET /latest/meta-data/spot/instance-action +// returning HTTP 200 with a JSON body shaped: +// { "action": "stop|terminate|hibernate", "time": "2026-05-19T12:34:56Z" } +// +// Until interruption is scheduled, the endpoint returns 404. We poll every +// `pollInterval` (default 5s). 
AWS guarantees the notice lands at least +// ~2 minutes before the instance is reclaimed, so 5s polling gives ample +// drain budget. +// +// IMDSv2 requires a session token (PUT /latest/api/token). Tokens can +// live for up to 6h, but we request a 60s TTL and re-fetch on every poll for +// simplicity — IMDS is link-local, the round-trip cost is negligible +// compared to the network operations the worker is otherwise doing. + +package preemption + +import ( + "context" + "encoding/json" + "io" + "log" + "net/http" + "strings" + "time" +) + +type awsMonitor struct { + pollInterval time.Duration + imdsEndpoint string +} + +func (m *awsMonitor) Name() string { return "aws-imds" } + +func (m *awsMonitor) Watch(ctx context.Context) <-chan Notice { + ch := make(chan Notice, 1) + + go func() { + defer close(ch) + + // Standalone client so the global default client's settings can't + // stall our IMDS round-trips (capped at 2s by Timeout below) with + // unrelated transport configuration. + client := &http.Client{Timeout: 2 * time.Second} + + ticker := time.NewTicker(m.pollInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + notice, found, err := m.probe(ctx, client) + if err != nil { + // TODO: rate-limit this. IMDS unreachability is interesting once + // per minute, but as written every failed poll is logged. + log.Printf("preemption: aws probe error (will retry): %v", err) + continue + } + if !found { + continue + } + // Buffered channel — non-blocking send. If the consumer is + // somehow not reading, drop the second-and-later notices + // (the first one is what counts). + select { + case ch <- notice: + default: + } + // Keep polling — the action / time can change between + // notice issuance and reclaim, and we want the consumer to + // see the freshest data. Drain is idempotent. + } + } + }() + + return ch +} + +// probe issues one IMDSv2 token-then-get round trip.
Returns (Notice, true, +// nil) when interruption is imminent, (zero, false, nil) when healthy, and +// (zero, false, err) on transport / decode failures. +func (m *awsMonitor) probe(ctx context.Context, client *http.Client) (Notice, bool, error) { + token, err := m.fetchToken(ctx, client) + if err != nil { + return Notice{}, false, err + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, + m.imdsEndpoint+"/latest/meta-data/spot/instance-action", nil) + if err != nil { + return Notice{}, false, err + } + req.Header.Set("X-aws-ec2-metadata-token", token) + + resp, err := client.Do(req) + if err != nil { + return Notice{}, false, err + } + defer resp.Body.Close() + + // 404 = healthy. AWS specifies this endpoint returns 404 until + // interruption is scheduled. + if resp.StatusCode == http.StatusNotFound { + return Notice{}, false, nil + } + if resp.StatusCode != http.StatusOK { + // Drain body to keep the connection reusable. + _, _ = io.Copy(io.Discard, resp.Body) + return Notice{}, false, nil + } + + var body struct { + Action string `json:"action"` + Time string `json:"time"` + } + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return Notice{}, false, err + } + + eta, err := time.Parse(time.RFC3339, body.Time) + if err != nil { + // AWS guarantees RFC3339; if they ever change format we'd rather + // fire a Notice with the zero time than miss the signal — the + // drain budget still applies, and the caller can fallback to + // "drain now". + log.Printf("preemption: aws unexpected time format %q: %v", body.Time, err) + } + + return Notice{ + Action: Action(strings.ToLower(body.Action)), + ETA: eta, + Source: "aws-imds", + }, true, nil +} + +// fetchToken obtains an IMDSv2 session token. The token is single-use here; +// see package doc for why that's fine. 
+func (m *awsMonitor) fetchToken(ctx context.Context, client *http.Client) (string, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodPut, + m.imdsEndpoint+"/latest/api/token", nil) + if err != nil { + return "", err + } + req.Header.Set("X-aws-ec2-metadata-token-ttl-seconds", "60") + + resp, err := client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + _, _ = io.Copy(io.Discard, resp.Body) + return "", &imdsHTTPError{code: resp.StatusCode} + } + + b, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + return strings.TrimSpace(string(b)), nil +} + +type imdsHTTPError struct{ code int } + +func (e *imdsHTTPError) Error() string { + return "imds token fetch returned HTTP " + http.StatusText(e.code) +} diff --git a/internal/preemption/azure.go b/internal/preemption/azure.go new file mode 100644 index 00000000..4212a7ec --- /dev/null +++ b/internal/preemption/azure.go @@ -0,0 +1,47 @@ +// Azure Scheduled Events preemption monitor. +// +// Azure publishes scheduled-events on IMDS at +// GET /metadata/scheduledevents?api-version=2020-07-01 +// (requires header `Metadata: true`). Response shape: +// { +// "DocumentIncarnation": 42, +// "Events": [ +// { "EventId": "...", "EventType": "Preempt|Reboot|Redeploy|Freeze|Terminate", +// "ResourceType": "VirtualMachine", +// "Resources": [""], +// "EventStatus": "Scheduled|Started", +// "NotBefore": "Mon, 19 May 2026 12:34:56 GMT" } +// ] +// } +// +// "Preempt" is the spot-equivalent ("This VM is being preempted; you have +// ~30s"). "Terminate" is unscheduled-but-imminent. +// +// This is a stub for the PoC — Azure cells aren't running the new +// preemption-aware code path yet. Wiring this up later is mostly populating +// the Watch loop with the same probe-then-emit pattern as the AWS monitor. 
+ +package preemption + +import ( + "context" + "log" + "time" +) + +type azureMonitor struct { + pollInterval time.Duration + imdsEndpoint string +} + +func (m *azureMonitor) Name() string { return "azure-scheduled-events" } + +func (m *azureMonitor) Watch(ctx context.Context) <-chan Notice { + ch := make(chan Notice, 1) + go func() { + defer close(ch) + log.Printf("preemption: azure monitor stubbed — TODO wire to /metadata/scheduledevents (see internal/preemption/azure.go)") + <-ctx.Done() + }() + return ch +} diff --git a/internal/preemption/monitor.go b/internal/preemption/monitor.go new file mode 100644 index 00000000..a150d32f --- /dev/null +++ b/internal/preemption/monitor.go @@ -0,0 +1,96 @@ +// Package preemption watches the cloud-specific spot-interruption signal and +// publishes a Notice on a channel once interruption is imminent. The worker +// binary subscribes to the channel and kicks off graceful drain: +// +// 1. Flip its Redis-heartbeat state to "draining" so the CP stops placing. +// 2. Hibernate every live sandbox to S3 in parallel (existing primitive). +// 3. Delete the heartbeat key; exit cleanly. +// +// On AWS the signal is IMDSv2 /latest/meta-data/spot/instance-action returning +// 200 with a JSON body. On Azure it's /metadata/scheduledevents. The shape of +// the work the worker does in response is identical — only the detection is +// cloud-specific. This package abstracts that. +// +// LocalCloud is decided by the OPENSANDBOX_CLOUD env var (set by cloud-init +// from terraform). If unset, the Monitor returned by NewMonitor is a no-op +// that never fires — safe default for dev / Azure-without-AWS. + +package preemption + +import ( + "context" + "log" + "os" + "time" +) + +// Action enumerates what the cloud says it will do to the instance. 
+type Action string + +const ( + ActionTerminate Action = "terminate" + ActionStop Action = "stop" + ActionHibernate Action = "hibernate" // AWS-only; Azure preempts only with "preempt" +) + +// Notice carries the advance-warning details. ETA is when the cloud says it +// will act; the worker has roughly ETA - now to drain. AWS guarantees ~2 min +// for spot interruption; Azure scheduled events give ~30s for preempt and +// up to 15 min for reboot/freeze. +type Notice struct { + Action Action + ETA time.Time + // Source describes where the notice came from for logging — "aws-imds", + // "azure-scheduled-events", or "redis" for CP-fanout fallback. + Source string +} + +// Monitor watches for preemption and emits a Notice on its channel when one +// arrives. Implementations must: +// - Return a buffered channel (size >= 1) so a slow consumer doesn't lose +// the notice. +// - Survive transient errors (network blips polling IMDS) — log + retry. +// - Stop cleanly when the provided context is canceled. +type Monitor interface { + Name() string + Watch(ctx context.Context) <-chan Notice +} + +// NewMonitor returns the cloud-appropriate monitor or a no-op if no cloud +// is configured. Callers should always call Watch — even the no-op returns +// a channel they can select on, simplifying wire-up. +func NewMonitor() Monitor { + switch os.Getenv("OPENSANDBOX_CLOUD") { + case "aws": + return &awsMonitor{ + pollInterval: 5 * time.Second, + imdsEndpoint: "http://169.254.169.254", + } + case "azure": + return &azureMonitor{ + pollInterval: 5 * time.Second, + imdsEndpoint: "http://169.254.169.254", + } + default: + log.Printf("preemption: no cloud configured (OPENSANDBOX_CLOUD unset) — preemption notices disabled") + return &noopMonitor{} + } +} + +// noopMonitor never emits anything. Used for dev and any deployment where +// the cloud's preemption signal isn't worth wiring up (on-demand-only, +// bare-metal-colo, etc.). 
+type noopMonitor struct{} + +func (noopMonitor) Name() string { return "noop" } +func (noopMonitor) Watch(ctx context.Context) <-chan Notice { + ch := make(chan Notice, 1) + // No notice is ever sent. The channel is closed once ctx is canceled, + // so receivers reading via "n, ok := <-ch" must check ok — a closed + // channel yields a zero Notice and must not be treated as a notice. + go func() { + <-ctx.Done() + close(ch) + }() + return ch +} diff --git a/internal/worker/redis_heartbeat.go b/internal/worker/redis_heartbeat.go index 1b53e5bd..82c22d60 100644 --- a/internal/worker/redis_heartbeat.go +++ b/internal/worker/redis_heartbeat.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "log" + "sync" "time" "github.com/redis/go-redis/v9" @@ -65,6 +66,7 @@ type RedisHeartbeat struct { workerVersion string wasDown bool // true if the last publish failed (used to detect reconnect) stop chan struct{} + stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer } // NewRedisHeartbeat creates a new heartbeat publisher. @@ -218,14 +220,18 @@ func (h *RedisHeartbeat) publish() { } // Stop stops the heartbeat publisher and closes the Redis connection. +// Idempotent — safe to call from both the preemption-handler goroutine and +// the normal `defer hb.Stop()` shutdown path.
func (h *RedisHeartbeat) Stop() { - close(h.stop) + h.stopOnce.Do(func() { + close(h.stop) - // Remove the key so the server knows we're gone immediately - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - h.rdb.Del(ctx, "worker:"+h.workerID) + // Remove the key so the server knows we're gone immediately + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + h.rdb.Del(ctx, "worker:"+h.workerID) - h.rdb.Close() - log.Println("redis_heartbeat: stopped") + h.rdb.Close() + log.Println("redis_heartbeat: stopped") + }) } From 79c8836d42cfc9c8c89745cb10dcb121bed4a72a Mon Sep 17 00:00:00 2001 From: motatoes Date: Tue, 19 May 2026 17:02:27 -0700 Subject: [PATCH 02/32] config: pass region explicitly to AWS SDK config loader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LoadDefaultConfig didn't auto-detect the region on the CP EC2 — the first ListSecrets call returned 'failed to resolve service endpoint: Missing Region'. Read OPENSANDBOX_REGION → AWS_REGION explicitly and pass via awsconfig.WithRegion(). Falls through to the default chain if neither is set. --- internal/config/secretsmanager.go | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/internal/config/secretsmanager.go b/internal/config/secretsmanager.go index a91b512c..71591686 100644 --- a/internal/config/secretsmanager.go +++ b/internal/config/secretsmanager.go @@ -39,7 +39,22 @@ type awsSecretsManagerProvider struct { func (p *awsSecretsManagerProvider) Name() string { return "aws-secretsmanager" } func (p *awsSecretsManagerProvider) Load(ctx context.Context, mode string) (int, int, error) { - cfg, err := awsconfig.LoadDefaultConfig(ctx) + // Region: explicit env var wins (matches the cell config), else AWS_REGION, + // else fall through to the SDK's default chain (shared config/profile; IMDS is only consulted when awsconfig.WithEC2IMDSRegion is passed).
Passing nothing + // when LoadDefaultConfig can't resolve a region anywhere makes the first + // API call return "Missing Region", which surfaces unclearly — prefer the + // explicit env-var path even on EC2. + region := os.Getenv("OPENSANDBOX_REGION") + if region == "" { + region = os.Getenv("AWS_REGION") + } + + var loadOpts []func(*awsconfig.LoadOptions) error + if region != "" { + loadOpts = append(loadOpts, awsconfig.WithRegion(region)) + } + + cfg, err := awsconfig.LoadDefaultConfig(ctx, loadOpts...) if err != nil { return 0, 0, fmt.Errorf("secretsmanager: load aws config: %w", err) } From 0497f465afb7e6c1ad462223f9a27febcc086f57 Mon Sep 17 00:00:00 2001 From: motatoes Date: Tue, 19 May 2026 17:05:40 -0700 Subject: [PATCH 03/32] packer aws: upload vector dir as tarball to avoid scp -r failure scp doesn't recursively upload directories without -r; Packer's file provisioner doesn't always pass the flag. Switched to the same tar + shell-extract pattern used for the rootfs context. Pre-build: tar czf /tmp/packer-vector-ctx.tar.gz deploy/vector/ --- deploy/packer/worker-ami-aws.pkr.hcl | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/deploy/packer/worker-ami-aws.pkr.hcl b/deploy/packer/worker-ami-aws.pkr.hcl index f00d9fef..9a7b4edf 100644 --- a/deploy/packer/worker-ami-aws.pkr.hcl +++ b/deploy/packer/worker-ami-aws.pkr.hcl @@ -82,6 +82,12 @@ variable "rootfs_context" { description = "Pre-built tarball of rootfs + agent wrapper sources." } +variable "vector_context" { + type = string + default = "/tmp/packer-vector-ctx.tar.gz" + description = "Pre-built tarball of deploy/vector/ (config + populator + units). Pre-create with: tar czf /tmp/packer-vector-ctx.tar.gz deploy/vector/" +} + variable "golden_cache_bucket" { type = string default = "" @@ -174,10 +180,20 @@ build { destination = "/tmp/opensandbox-worker.service" } - # 4. Upload Vector config + populator. + # 4. Upload Vector config + populator. 
Packer's file provisioner doesn't + # do recursive directory upload reliably across SSH clients, so we + # tar/extract the same way we do the rootfs context above. See + # var.vector_context for the pre-build command. provisioner "file" { - source = "deploy/vector/" - destination = "/tmp/vector/" + source = var.vector_context + destination = "/tmp/vector-ctx.tar.gz" + } + provisioner "shell" { + inline = [ + "mkdir -p /tmp/vector", + "tar xzf /tmp/vector-ctx.tar.gz -C /tmp/vector --strip-components=2", # strip deploy/vector/ prefix + "rm /tmp/vector-ctx.tar.gz", + ] } # 5. Run the (misleadingly-named-but-cloud-agnostic) setup script. Installs From 0a765194e1b26d7c202c7866fe434b3a133fc5d4 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Mon, 1 Jun 2026 17:46:26 -0700 Subject: [PATCH 04/32] fix rebase conflict resolutions --- cmd/server/main.go | 226 +++++++++++++++++++++++++++++++++++++--- cmd/worker/main.go | 254 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 422 insertions(+), 58 deletions(-) diff --git a/cmd/server/main.go b/cmd/server/main.go index b1546cc2..6b96175e 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -7,13 +7,14 @@ import ( "log" "log/slog" "os" + "os/signal" "strconv" "strings" - "os/signal" "syscall" - "time" + "github.com/redis/go-redis/v9" + "github.com/opensandbox/opensandbox/internal/api" "github.com/opensandbox/opensandbox/internal/auth" "github.com/opensandbox/opensandbox/internal/billing" @@ -23,6 +24,7 @@ import ( "github.com/opensandbox/opensandbox/internal/controlplane" "github.com/opensandbox/opensandbox/internal/crypto" "github.com/opensandbox/opensandbox/internal/db" + "github.com/opensandbox/opensandbox/internal/edgeclient" "github.com/opensandbox/opensandbox/internal/metrics" "github.com/opensandbox/opensandbox/internal/obslog" "github.com/opensandbox/opensandbox/internal/observability" @@ -35,8 +37,9 @@ import ( var ServerVersion = "dev" func main() { - // Load secrets from the configured cloud secret 
store (Azure KV or AWS SM) - // before config.Load reads env vars. No-op if neither is configured. + // Load secrets from the configured cloud secret store before config.Load + // reads env vars. No-op if neither Azure Key Vault nor AWS Secrets Manager + // is configured. if err := config.LoadSecrets(); err != nil { log.Fatalf("failed to load secrets: %v", err) } @@ -76,10 +79,14 @@ func main() { // Build server options opts := &api.ServerOpts{ - Mode: cfg.Mode, - WorkerID: cfg.WorkerID, - Region: cfg.Region, - HTTPAddr: cfg.HTTPAddr, + Mode: cfg.Mode, + WorkerID: cfg.WorkerID, + Region: cfg.Region, + HTTPAddr: cfg.HTTPAddr, + CellID: cfg.CellID, + SessionJWTSecret: cfg.SessionJWTSecret, + CFAdminSecret: cfg.CFAdminSecret, + CFEventSecret: cfg.CFEventSecret, } // Initialize PostgreSQL if configured @@ -172,6 +179,29 @@ func main() { if err != nil { log.Fatalf("failed to connect to Redis: %v", err) } + // Reconcile-on-reconnect: when a worker rejoins after being pruned + // for missed heartbeats, run both directions of the cell-vs-worker + // state reconcile. See internal/controlplane/reconcile.go for the + // full rationale on each. Captured opts.Store + cfg.CellID directly; + // the closure reads them at call time. + // + // reverse first: cell-running but worker-doesn't-have-it → + // close on cell side (UpdateSessionStatus + EndScaleEvent + + // publish stopped event). Stops the billing leak immediately. + // + // forward second: cell-stopped but worker-still-hosting → + // re-issue Destroy via RPC. Cleans the worker side. + // + // Reverse runs first because the more urgent dollars-on-fire case is + // the still-open scale event accruing minute-by-minute, not a stray + // qemu the worker still has. 
+ redisRegistry.OnWorkerRejoined(func(workerID string) { + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) + defer cancel() + controlplane.ReconcileRunningOnWorker(ctx, redisRegistry, opts.Store, cfg.CellID, workerID) + controlplane.ReconcileStoppedOnWorker(ctx, redisRegistry, opts.Store, workerID) + }) + redisRegistry.Start() defer redisRegistry.Stop() opts.WorkerRegistry = redisRegistry @@ -183,6 +213,48 @@ func main() { opts.SandboxAPIProxy = proxy.NewSandboxAPIProxy(opts.Store, redisRegistry, opts.JWTIssuer) log.Println("opensandbox: sandbox API proxy enabled (data-plane requests proxied to workers)") } + + // CF-parallel event forwarder. Drains events:{cell_id} from Redis and + // POSTs HMAC-signed batches to the events-ingest Worker. Inert when + // CFEventEndpoint is empty — old NATS path keeps running independently. + if cfg.CFEventEndpoint != "" && cfg.CFEventSecret != "" && cfg.CellID != "" { + cfClient := controlplane.NewCFEventClient(cfg.CFEventEndpoint, cfg.CFEventSecret, cfg.CellID) + fwd, err := controlplane.NewEventForwarder(controlplane.EventForwarderConfig{ + Redis: redisRegistry.RedisClient(), + CellID: cfg.CellID, + Client: cfClient, + }) + if err != nil { + log.Fatalf("event_forwarder: %v", err) + } + if err := fwd.Start(context.Background()); err != nil { + log.Fatalf("event_forwarder start: %v", err) + } + defer func() { + stopCtx, stopCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer stopCancel() + _ = fwd.Stop(stopCtx) + }() + log.Printf("opensandbox: CF event forwarder started (endpoint=%s cell=%s)", cfg.CFEventEndpoint, cfg.CellID) + } else if cfg.Mode == "server" { + log.Printf("opensandbox: CF event forwarder NOT started (CFEventEndpoint/Secret/CellID unset)") + } + + // Capacity reporter — periodically pushes a cell_capacity event onto the + // same events:{cell_id} stream the forwarder drains. Feeds the edge's + // pickCell() cascade via D1. Inert when CellID is empty. 
+ if cfg.CellID != "" { + cr, err := controlplane.NewCapacityReporter(controlplane.CapacityReporterConfig{ + Redis: redisRegistry.RedisClient(), + Registry: redisRegistry, + CellID: cfg.CellID, + }) + if err != nil { + log.Fatalf("capacity_reporter: %v", err) + } + cr.Start(context.Background()) + defer cr.Stop() + } } // Hoisted at function scope so the per-sandbox autoscaler (created @@ -194,10 +266,63 @@ func main() { // Initialize compute pool + autoscaler (server mode) if cfg.Mode == "server" && redisRegistry != nil { + // Build the WorkerSpec: cloud-neutral config that the CP supplies to + // whichever pool is selected. The pool combines this with cloud-specific + // cloud-init to launch new workers. + // + // Workers need to reach Postgres/Redis on the CP's private IP, + // not localhost. Replace localhost with the CP's IP if known. + cpIP := os.Getenv("OPENSANDBOX_CONTROLPLANE_IP") + workerDBURL := cfg.DatabaseURL + workerRedisURL := cfg.RedisURL + if cpIP != "" { + workerDBURL = strings.ReplaceAll(workerDBURL, "localhost", cpIP) + workerDBURL = strings.ReplaceAll(workerDBURL, "127.0.0.1", cpIP) + workerRedisURL = strings.ReplaceAll(workerRedisURL, "localhost", cpIP) + workerRedisURL = strings.ReplaceAll(workerRedisURL, "127.0.0.1", cpIP) + } + spec := compute.WorkerSpec{ + CellID: cfg.CellID, + Region: cfg.Region, + DatabaseURL: workerDBURL, + RedisURL: workerRedisURL, + JWTSecret: cfg.JWTSecret, + SessionJWTSecret: cfg.SessionJWTSecret, + CFEventEndpoint: cfg.CFEventEndpoint, + CFEventSecret: cfg.CFEventSecret, + CFAdminSecret: cfg.CFAdminSecret, + MaxCapacity: cfg.MaxCapacity, + SandboxDomain: cfg.SandboxDomain, + DefaultMemoryMB: cfg.DefaultSandboxMemoryMB, + DefaultCPUs: cfg.DefaultSandboxCPUs, + DefaultDiskMB: cfg.DefaultSandboxDiskMB, + S3Bucket: cfg.S3Bucket, + S3Region: cfg.S3Region, + S3Endpoint: cfg.S3Endpoint, + S3AccessKeyID: cfg.S3AccessKeyID, + S3SecretAccessKey: cfg.S3SecretAccessKey, + S3ForcePathStyle: cfg.S3ForcePathStyle, + 
SegmentWriteKey: cfg.SegmentWriteKey, + SecretsRef: cfg.SecretsARN, + } + + // Provider selection. Explicit cfg.ComputeProvider wins; otherwise we + // autodetect from existing fields for backwards compatibility. + provider := cfg.ComputeProvider + if provider == "" { + switch { + case cfg.AzureSubscriptionID != "" && (cfg.AzureImageID != "" || cfg.AzureKeyVaultName != ""): + provider = "azure" + case cfg.EC2AMI != "" || cfg.EC2SSMParameterName != "": + provider = "aws" + } + } + var pool compute.Pool var poolName string - if cfg.AzureSubscriptionID != "" && (cfg.AzureImageID != "" || cfg.AzureKeyVaultName != "") { + switch provider { + case "azure": // Build worker env template — new VMs get this via cloud-init. // GRPC_ADVERTISE, HTTP_ADDR, and WORKER_ID are patched by cloud-init // with the VM's actual private IP and hostname. @@ -303,7 +428,7 @@ func main() { if err != nil { log.Fatalf("opensandbox: failed to create Azure pool: %v", err) } - // If image not set statically but Key Vault is configured, fetch initial image + azPool.SetWorkerSpec(spec) if cfg.AzureImageID == "" && cfg.AzureKeyVaultName != "" { imgID, version, kvErr := azPool.RefreshAMI(context.Background()) if kvErr != nil { @@ -313,8 +438,8 @@ func main() { } pool = azPool poolName = fmt.Sprintf("Azure (size=%s, image=%s, keyvault=%s)", cfg.AzureVMSize, cfg.AzureImageID, cfg.AzureKeyVaultName) - } else if cfg.EC2AMI != "" || cfg.EC2SSMParameterName != "" { - // AWS EC2 compute pool (AMI from config or dynamically from SSM) + + case "aws": ec2Pool, err := compute.NewEC2Pool(compute.EC2PoolConfig{ Region: cfg.S3Region, AccessKeyID: cfg.S3AccessKeyID, @@ -331,7 +456,7 @@ func main() { if err != nil { log.Fatalf("opensandbox: failed to create EC2 pool: %v", err) } - // If AMI not set statically but SSM is configured, fetch initial AMI from SSM + ec2Pool.SetWorkerSpec(spec) if cfg.EC2AMI == "" && cfg.EC2SSMParameterName != "" { amiID, version, ssmErr := ec2Pool.RefreshAMI(context.Background()) if 
ssmErr != nil { @@ -341,6 +466,11 @@ func main() { } pool = ec2Pool poolName = fmt.Sprintf("EC2 (ami=%s, type=%s, ssm=%s)", cfg.EC2AMI, cfg.EC2InstanceType, cfg.EC2SSMParameterName) + + case "": + log.Println("opensandbox: no compute provider configured (combined mode, no autoscaling)") + default: + log.Fatalf("opensandbox: unknown compute provider %q (expected azure|aws)", provider) } if pool != nil { @@ -369,6 +499,13 @@ func main() { MaxWorkers: cfg.MaxWorkersPerRegion, IdleReserve: cfg.IdleReserveWorkers, MachineSizes: machineSizes, + // For "migrated" event emit after scaler-driven migrations + // (rolling replace, evacuation) — keeps D1 sandboxes_index + // worker_id in sync with cell-PG truth. Without this, the + // dashboard's "which worker is my sandbox on" view goes stale + // every time the autoscaler shuffles things around. + RedisClient: redisRegistry.RedisClient(), + CellID: cfg.CellID, }) defer scaler.Stop() @@ -413,12 +550,21 @@ func main() { for _, w := range redisRegistry.GetAllWorkers() { liveWorkers[w.ID] = true } - orphaned, err := opts.Store.MarkOrphanedSandboxes(ctx, liveWorkers) + orphans, err := opts.Store.MarkOrphanedSandboxes(ctx, liveWorkers) if err != nil { log.Printf("maintenance: orphan reconciliation error: %v", err) observability.CaptureError(err, "area", "maintenance", "op", "mark_orphaned_sandboxes") - } else if orphaned > 0 { - log.Printf("maintenance: marked %d sandboxes as error (worker lost)", orphaned) + } else if len(orphans) > 0 { + log.Printf("maintenance: marked %d sandboxes as error (worker lost)", len(orphans)) + // Mirror to D1 via the events stream. Without these XADDs, + // sandboxes_index keeps showing the rows as running on the + // dead worker indefinitely — the post-cutover ghost-row + // bug. Best-effort; the next tick will re-emit for any + // row still marked `error` on a dead worker if Redis + // rejected the first attempt. 
+ if redisRegistry.RedisClient() != nil && cfg.CellID != "" { + publishStoppedFromMaintenance(ctx, redisRegistry.RedisClient(), cfg.CellID, orphans) + } } } }) @@ -455,6 +601,15 @@ func main() { // Create API server server := api.NewServer(mgr, ptyMgr, cfg.APIKey, opts) + // Wire the CF api-edge HTTP client. Used by resolveSecretStoreInto + + // resolveTemplate to read from D1 over HMAC instead of local PG once + // migration 041 strips the global tables. Falls back to s.store if + // either CFEdgeBaseURL or CFEventSecret is unset (combined dev mode). + if cfg.CFEdgeBaseURL != "" && cfg.CFEventSecret != "" { + server.SetEdgeClient(edgeclient.New(cfg.CFEdgeBaseURL, cfg.CFEventSecret)) + log.Printf("opensandbox: edge client wired (base=%s)", cfg.CFEdgeBaseURL) + } + // Wire Axiom read-only token for the sandbox session logs API. // Token never leaves this process; the UI proxies its queries through // /api/sandboxes/:id/logs. Empty token disables the endpoint (503). @@ -509,11 +664,40 @@ func main() { workers = redisRegistry } reporter := billing.NewUsageReporter(opts.Store, stripeClient, workers) + // CF billing mode: when this CP is wired into the CF event pipe, the + // CreditAccount DO is authoritative on free-tier balance. Disable + // the local free-tier deduction pass so both sides don't race. + if cfg.CFEventEndpoint != "" { + reporter.SetCFBillingMode(true) + log.Println("opensandbox: usage reporter CF-billing mode ON (free-tier deduction deferred to CreditAccount DO)") + } reporter.Start() defer reporter.Stop() log.Println("opensandbox: usage reporter started (interval=5m)") } + // Halt reconciler — safety net for missed CF halt webhooks. Pulls the + // authoritative halt-list from api-edge every 60s and re-issues halts + // for anything that should be halted but isn't. Inert unless + // OPENSANDBOX_HALT_LIST_URL is set. 
+ if cfg.HaltListURL != "" && cfg.CFEventSecret != "" && server != nil { + reconciler := controlplane.NewHaltReconciler(controlplane.HaltReconcilerConfig{ + CellID: cfg.CellID, + ListURL: cfg.HaltListURL, + Secret: cfg.CFEventSecret, + Halter: server, + }) + if reconciler != nil { + reconciler.Start(ctx) + defer func() { + stopCtx, stopCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer stopCancel() + _ = reconciler.Stop(stopCtx) + }() + log.Printf("opensandbox: halt reconciler started (list_url=%s, period=60s)", cfg.HaltListURL) + } + } + // Phase-2 capacity allocator. Writes outbox rows for unified-mode // pro orgs after each settled bucket. Allocator skips legacy and // free orgs (see ListAllocatorCandidates); rollback is by @@ -629,3 +813,13 @@ func getIntEnv(key string, def int) int { } return def } + +// publishStoppedFromMaintenance emits a `stopped` lifecycle event per +// orphaned sandbox so D1 sandboxes_index mirrors the PG sweep done by +// MarkOrphanedSandboxes. Without these XADDs, the maintenance loop's +// dead-worker cleanup is invisible to the dashboard. 
+func publishStoppedFromMaintenance(ctx context.Context, rdb *redis.Client, cellID string, orphans []db.OrphanedSandbox) { + for _, o := range orphans { + controlplane.PublishLifecycle(ctx, rdb, cellID, "stopped", o.SandboxID, o.WorkerID, o.OrgID, "worker_lost") + } +} diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 16e28879..1f4c0d04 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -14,9 +14,13 @@ import ( "syscall" "time" + "github.com/google/uuid" + "github.com/redis/go-redis/v9" + "github.com/opensandbox/opensandbox/internal/analytics" "github.com/opensandbox/opensandbox/internal/auth" "github.com/opensandbox/opensandbox/internal/blobstore" + "github.com/opensandbox/opensandbox/internal/cellevents" "github.com/opensandbox/opensandbox/internal/config" "github.com/opensandbox/opensandbox/internal/db" "github.com/opensandbox/opensandbox/internal/metrics" @@ -41,7 +45,7 @@ var WorkerVersion = "dev" func main() { // Subcommands that don't need config/secrets. Must short-circuit before - // LoadSecrets, which is slow and would fail without cloud credentials. + // LoadSecrets, which is slow and can fail without cloud credentials. // // "golden-version " prints the full-file hash used for golden-image // archive keys. Packer invokes this so the archive key matches what @@ -79,9 +83,11 @@ func main() { return } - // Load secrets from Azure Key Vault if configured (before config.Load reads env vars). + // Load secrets from the configured cloud secret store before config.Load + // reads env vars. No-op if neither Azure Key Vault nor AWS Secrets Manager + // is configured. 
if err := config.LoadSecrets(); err != nil { - log.Fatalf("failed to load secrets from Key Vault: %v", err) + log.Fatalf("failed to load secrets: %v", err) } cfg, err := config.Load() @@ -126,6 +132,10 @@ func main() { var doGracefulShutdown func(checkpointStore *storage.CheckpointStore, store *db.Store) // Metadata server (set by QEMU backend, wired to store later) var metadataSrv *worker.MetadataServer + // Forward-declared so doGracefulShutdown's closure (built in the QEMU + // init block below) can capture it; the actual NewSandboxDBManager call + // happens after backend init. + var sandboxDBMgr *sandbox.SandboxDBManager // Initialize secrets proxy for MITM token substitution. // Runs on :3128 — VMs route HTTPS through this to keep real secrets off-VM. @@ -313,7 +323,7 @@ func main() { log.Printf("opensandbox-worker: %d VMs failed to hibernate: %v", len(failed), failed) } - processHibernateResults(results, store, checkpointStore, func(r interface{}) (string, string, error) { + processHibernateResults(results, store, checkpointStore, sandboxDBMgr, func(r interface{}) (string, string, error) { hr := r.(qm.HibernateAllResult) return hr.SandboxID, hr.HibernationKey, hr.Err }) @@ -340,8 +350,9 @@ func main() { ptyMgr := sandbox.NewAgentPTYManager(ptySessionFactory) defer ptyMgr.CloseAll() - // Initialize per-sandbox SQLite manager - sandboxDBMgr := sandbox.NewSandboxDBManager(cfg.DataDir) + // Initialize per-sandbox SQLite manager (forward-declared above so the + // graceful-shutdown closure can capture it). 
+ sandboxDBMgr = sandbox.NewSandboxDBManager(cfg.DataDir) defer sandboxDBMgr.Close() // JWT issuer @@ -413,11 +424,21 @@ func main() { defer store.Close() log.Println("opensandbox-worker: PostgreSQL store connected (auto-wake enabled)") - _, stopped, err := store.ReconcileWorkerSessions(ctx, cfg.WorkerID) + hibernated, stopped, err := store.ReconcileWorkerSessions(ctx, cfg.WorkerID) if err != nil { log.Printf("opensandbox-worker: warning: session reconciliation failed: %v", err) - } else if stopped > 0 { - log.Printf("opensandbox-worker: reconciled %d unrecoverable sessions as stopped", stopped) + } else { + if len(stopped) > 0 { + log.Printf("opensandbox-worker: reconciled %d unrecoverable sessions as stopped", len(stopped)) + } + if len(hibernated) > 0 { + log.Printf("opensandbox-worker: reconciled %d sessions as hibernated", len(hibernated)) + } + // Mirror PG state changes to D1 via the cell events stream. + // Without these XADDs, the dashboard keeps the rows at the + // pre-restart state ("running" on a worker that just rebooted). + emitReconcileEvents(ctx, cfg, "hibernated", "worker_restart", hibernated) + emitReconcileEvents(ctx, cfg, "stopped", "worker_restart", stopped) } // Wire up metadata server billing callback @@ -475,6 +496,20 @@ func main() { } _ = store.UpdateSandboxSessionStatus(context.Background(), sandboxID, "hibernated", nil) } + // Emit "hibernated" lifecycle event so events-ingest mirrors the + // status flip to D1 sandboxes_index. Without this, idle-timeout + // hibernations only land in cell PG and D1 keeps showing the + // sandbox as running on a worker that no longer hosts it. 
+ if sandboxDBMgr != nil { + if sdb, dbErr := sandboxDBMgr.Get(sandboxID); dbErr == nil { + _ = sdb.LogEvent("hibernated", map[string]string{ + "sandbox_id": sandboxID, + "checkpoint_key": result.HibernationKey, + "reason": "idle_timeout", + }) + } + _ = sandboxDBMgr.Remove(sandboxID) + } }, OnKill: func(sandboxID string) { log.Printf("opensandbox-worker: sandbox %s killed on timeout", sandboxID) @@ -482,6 +517,17 @@ func main() { if store != nil { _ = store.UpdateSandboxSessionStatus(context.Background(), sandboxID, "stopped", nil) } + // Same fix as OnHibernate above — D1 needs a "stopped" event so + // the dashboard doesn't keep the row at "running" forever. + if sandboxDBMgr != nil { + if sdb, dbErr := sandboxDBMgr.Get(sandboxID); dbErr == nil { + _ = sdb.LogEvent("stopped", map[string]string{ + "sandbox_id": sandboxID, + "reason": "kill_timeout", + }) + } + _ = sandboxDBMgr.Remove(sandboxID) + } }, }) defer sbRouter.Close() @@ -638,58 +684,103 @@ func main() { fixed, err := store.ReconcileWorkerReconnect(context.Background(), cfg.WorkerID, runningIDs) if err != nil { log.Printf("opensandbox-worker: reconnect reconciliation failed: %v", err) - } else if fixed > 0 { - log.Printf("opensandbox-worker: reconnect reconciliation: %d sessions restored to running", fixed) + } else if len(fixed) > 0 { + log.Printf("opensandbox-worker: reconnect reconciliation: %d sessions restored to running", len(fixed)) + // Mirror to D1: emit `running` events so the dashboard + // reflects the recovery. Without these, D1 keeps the + // `error` status from the maintenance loop's previous + // sweep — customer sees their sandbox as broken until + // the next state-changing event. + // + // Bounded context so a redis stall during shutdown can't + // block this callback forever. Budget: 10s per event + // (3s XADD × up to 3 retries) × len(fixed), capped at + // 60s overall. Reconnect storms rarely produce more + // than a handful of fixed rows. 
+ emitCtx, emitCancel := context.WithTimeout(context.Background(), 60*time.Second) + emitReconcileEvents(emitCtx, cfg, "running", "worker_reconnect", fixed) + emitCancel() } }) } defer hb.Stop() log.Println("opensandbox-worker: Redis heartbeat started") - // Spot-preemption monitor. NewMonitor returns a no-op on - // non-cloud deployments — the goroutine still spins but - // never fires. When OPENSANDBOX_CLOUD=aws the AWS monitor - // polls IMDSv2 every 5s for /latest/meta-data/spot/instance-action. - // - // On a Notice we drain in the most minimal way possible for - // the PoC: stop the heartbeat so the CP sees us as gone and - // re-schedules our sandboxes. The hibernate-each-sandbox path - // is the next iteration — for now, sandboxes on a preempted - // host fail and the customer re-creates. This matches the - // "PoC accepts sandbox state loss on reclaim" risk in the plan. preemptMon := preemption.NewMonitor() go func() { notices := preemptMon.Watch(ctx) for notice := range notices { - log.Printf("opensandbox-worker: PREEMPTION notice from %s — action=%s eta=%s, draining now", + log.Printf("opensandbox-worker: PREEMPTION notice from %s - action=%s eta=%s, draining now", preemptMon.Name(), notice.Action, notice.ETA.Format(time.RFC3339)) hb.Stop() - // TODO: hibernate live sandboxes via mgr to S3 within - // the ETA budget before exiting. Until then the - // kernel/systemd terminates us when the cloud reclaims. return } }() } } - // NATS - if cfg.NATSURL != "" { - pub, err := worker.NewEventPublisher(cfg.NATSURL, cfg.Region, cfg.WorkerID, sandboxDBMgr) + // CF-parallel: Redis Streams event publisher. Inert unless CellID is set. + // (The legacy NATS publisher used to run alongside this; it was removed + // once Redis Streams covered all event types end-to-end. NATSURL in the + // env file is ignored.) + if cfg.CellID != "" && cfg.RedisURL != "" { + // Resolver: look up sandbox → org → plan via cell-local PG. 
Called + // per event during flush; sandbox_sessions has an indexed lookup on + // sandbox_id and orgs is keyed by org_id, so each call is two index + // hits. usage_tick volume is sandboxes × ~30s, so cost is bounded. + // nil store (no PG) leaves the fields blank — events-ingest then + // treats them as "unknown plan" and skips DO debit, which is the + // safe fallback. + var planResolver worker.MetadataResolver + if store != nil { + st := store + planResolver = func(sandboxID string) (string, string, bool) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + orgIDStr, err := st.GetSandboxOrgID(ctx, sandboxID) + if err != nil || orgIDStr == "" { + return "", "", false + } + orgID, err := uuid.Parse(orgIDStr) + if err != nil { + return "", "", false + } + org, err := st.GetOrg(ctx, orgID) + if err != nil { + // Org row may not exist yet (new app2 user, lazy upsert + // hasn't run from this sandbox's create). Return the + // org_id so events-ingest can still archive; leave plan + // blank so it skips DO debit until the row exists. 
+ return orgIDStr, "", true + } + return orgIDStr, org.Plan, true + } + } + redisPub, err := worker.NewRedisEventPublisher(worker.RedisEventPublisherConfig{ + RedisURL: cfg.RedisURL, + SandboxDBs: sandboxDBMgr, + CellID: cfg.CellID, + WorkerID: cfg.WorkerID, + Resolver: planResolver, + }) if err != nil { - log.Printf("opensandbox-worker: NATS not available: %v (continuing without event sync)", err) + log.Printf("opensandbox-worker: Redis event publisher init failed: %v (continuing)", err) } else { - pub.Start() - if qemuMgr != nil { - pub.SetGoldenVersion(qemuMgr.GoldenVersion()) - } - pub.StartHeartbeat(func() (int, int, float64, float64, float64) { - count, _ := mgr.Count(context.Background()) - cpuPct, memPct, diskPct := worker.SystemStats() - return cfg.MaxCapacity, count, cpuPct, memPct, diskPct + // Wire the publisher as the SandboxDBManager's OnRemove hook so + // terminal events (stopped, hibernated) are synchronously flushed + // to Redis BEFORE the SQLite file is deleted. The destroy / + // hibernate gRPC handlers LogEvent + then call Remove; this hook + // closes the race that previously dropped those events. + sandboxDBMgr.SetOnRemove(func(sandboxID string) { + redisPub.FlushSandbox(context.Background(), sandboxID) }) - defer pub.Stop() - log.Println("opensandbox-worker: NATS event publisher started") + redisPub.Start(context.Background()) + defer func() { + stopCtx, stopCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer stopCancel() + _ = redisPub.Stop(stopCtx) + }() + log.Printf("opensandbox-worker: Redis event publisher started (stream=events:%s)", cfg.CellID) } } @@ -697,6 +788,25 @@ func main() { autosaver := worker.NewWorkspaceAutosaver(mgr, autosaverSyncer, 5*time.Minute) autosaver.Start() + // Usage ticker — drives the free-tier billing loop. Emits a usage_tick + // per running sandbox every 20s; events-ingest fans out to per-org + // CreditAccount DOs, which debit balance + dispatch halt when it + // hits zero. 
Without this the free-tier balance never decrements. + // Inert unless CellID is set (combined-mode dev without Redis stream + // would write events to /dev/null since there's no consumer). + if cfg.CellID != "" && mgr != nil { + usageTicker := worker.NewUsageTicker(mgr, sandboxDBMgr, 20*time.Second, 10) + if usageTicker != nil { + usageTicker.Start(context.Background()) + defer func() { + stopCtx, stopCancel := context.WithTimeout(context.Background(), 3*time.Second) + defer stopCancel() + _ = usageTicker.Stop(stopCtx) + }() + log.Println("opensandbox-worker: usage ticker started (interval=20s, 10¢/tick)") + } + } + // Segment analytics — ships per-org GB-seconds memory usage. nil if SEGMENT_WRITE_KEY unset. segmentClient := analytics.New(cfg.SegmentWriteKey) if segmentClient != nil { @@ -722,9 +832,25 @@ func main() { OnHibernateIdle: func(sandboxIDs []string) { for _, id := range sandboxIDs { if checkpointStore != nil { - _, err := mgr.Hibernate(context.Background(), id, checkpointStore) + result, err := mgr.Hibernate(context.Background(), id, checkpointStore) if err != nil { log.Printf("pressure-hibernate %s: %v", id, err) + continue + } + // Mirror the per-sandbox SQLite event the gRPC Hibernate + // handler writes — without this, auto-hibernate writes + // the SUCCEED to local PG but never publishes "hibernated" + // to events-ingest, so D1 sandboxes_index drifts to + // "running" while the cell PG says "hibernated". + if sandboxDBMgr != nil { + if sdb, dbErr := sandboxDBMgr.Get(id); dbErr == nil { + _ = sdb.LogEvent("hibernated", map[string]string{ + "sandbox_id": id, + "checkpoint_key": result.HibernationKey, + "reason": "pressure_auto", + }) + } + _ = sandboxDBMgr.Remove(id) } } } @@ -1007,7 +1133,15 @@ func deleteOldHibernation(store *storage.CheckpointStore, key string) { } // processHibernateResults handles results from HibernateAll for both backends. 
-func processHibernateResults(results interface{}, store *db.Store, checkpointStore *storage.CheckpointStore, extract func(interface{}) (string, string, error)) { +// +// In addition to updating cell-local PG, we LogEvent("hibernated") into the +// per-sandbox SQLite then call sandboxDBs.Remove — the Remove hook flushes +// any unsynced events (including this one) to Redis Streams synchronously. +// events-ingest then mirrors the state to D1, keeping the dashboard list in +// sync. Without this, the bulk-shutdown path silently skipped the lifecycle +// event the gRPC HibernateSandbox handler emits per call, and D1 stayed +// "running" until something else nudged it. +func processHibernateResults(results interface{}, store *db.Store, checkpointStore *storage.CheckpointStore, sandboxDBs *sandbox.SandboxDBManager, extract func(interface{}) (string, string, error)) { switch rs := results.(type) { case []qm.HibernateAllResult: for _, r := range rs { @@ -1017,6 +1151,12 @@ func processHibernateResults(results interface{}, store *db.Store, checkpointSto errMsg := "hibernate failed on shutdown: " + r.Err.Error() _ = store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "stopped", &errMsg) } + if sandboxDBs != nil { + if sdb, err := sandboxDBs.Get(r.SandboxID); err == nil { + _ = sdb.LogEvent("stopped", map[string]string{"reason": "hibernate failed on shutdown"}) + } + _ = sandboxDBs.Remove(r.SandboxID) + } continue } log.Printf("opensandbox-worker: hibernated %s (key=%s)", r.SandboxID, r.HibernationKey) @@ -1029,10 +1169,40 @@ func processHibernateResults(results interface{}, store *db.Store, checkpointSto _ = store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "hibernated", nil) } } + if sandboxDBs != nil { + if sdb, err := sandboxDBs.Get(r.SandboxID); err == nil { + _ = sdb.LogEvent("hibernated", map[string]string{"key": r.HibernationKey, "reason": "graceful_shutdown"}) + } + _ = sandboxDBs.Remove(r.SandboxID) + } } } } +// 
emitReconcileEvents publishes one lifecycle event (`cellevents.PublishLifecycle`) per orphan to the +// cell's events stream. Used by the worker-startup and worker-reconnect +// reconcilers — both run rarely (boot, network blip) and need a redis client +// just for the emit. Keeping the construction here means one URL parse + one +// pool init + one Close per call site, instead of inlining the same dance +// twice in main(). ctx must be bounded by the caller; the function does not +// add its own timeout (cellevents.PublishLifecycle has a 3s XADD timeout per +// attempt with up to 3 retries, so worst case ~10s per event). +func emitReconcileEvents(ctx context.Context, cfg *config.Config, eventType, reason string, orphans []db.OrphanedSandbox) { + if len(orphans) == 0 || cfg.RedisURL == "" || cfg.CellID == "" { + return + } + opts, err := redis.ParseURL(cfg.RedisURL) + if err != nil { + log.Printf("opensandbox-worker: reconcile emit (%s): redis URL parse failed: %v — events skipped", eventType, err) + return + } + rdb := redis.NewClient(opts) + defer rdb.Close() + for _, o := range orphans { + cellevents.PublishLifecycle(ctx, rdb, cfg.CellID, eventType, o.SandboxID, o.WorkerID, o.OrgID, reason) + } +} + // recoverLocalQEMU handles local disk recovery for QEMU backend.
func recoverLocalQEMU(ctx context.Context, qmMgr *qm.Manager, store *db.Store, cfg *config.Config) { recoveries := qmMgr.RecoverLocalSandboxes() From d42fffea80b6f30528005e0097037c06242f993f Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Mon, 1 Jun 2026 21:08:50 -0700 Subject: [PATCH 05/32] Improve worker migration readiness startup --- cmd/worker/main.go | 29 +++- deploy/packer/worker-ami-aws.pkr.hcl | 29 ++++ internal/api/sandbox.go | 11 ++ internal/controlplane/redis_registry.go | 79 +++++++---- internal/controlplane/scaler.go | 41 +++--- internal/controlplane/scaler_test.go | 65 ++++++++- .../controlplane/worker_readiness_test.go | 26 ++++ internal/controlplane/worker_registry.go | 74 +++++++--- internal/qemu/manager.go | 129 +++++++++++++----- internal/worker/redis_heartbeat.go | 122 +++++++++++------ 10 files changed, 456 insertions(+), 149 deletions(-) create mode 100644 internal/controlplane/worker_readiness_test.go diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 1f4c0d04..de2b44a3 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -267,9 +267,8 @@ func main() { // capacity. See internal/qemu/orphan_reaper.go. 
qmMgr.StartOrphanReaper(ctx) - // Prepare golden snapshot for fast VM creation - if err := qmMgr.PrepareGoldenSnapshot(); err != nil { - log.Printf("opensandbox-worker: WARNING: golden snapshot failed, using cold boot: %v", err) + if err := qmMgr.LoadGoldenVersionFromImage(); err != nil { + log.Printf("opensandbox-worker: WARNING: base golden version not available yet: %v", err) } mgr = qmMgr @@ -608,6 +607,8 @@ func main() { } }() + var hb *worker.RedisHeartbeat + // Redis heartbeat if cfg.RedisURL != "" { grpcAdvertise := grpcAddr @@ -615,13 +616,16 @@ func main() { grpcAdvertise = addr } - hb, err := worker.NewRedisHeartbeat(cfg.RedisURL, cfg.WorkerID, cfg.Region, grpcAdvertise, cfg.HTTPAddr) + var err error + hb, err = worker.NewRedisHeartbeat(cfg.RedisURL, cfg.WorkerID, cfg.Region, grpcAdvertise, cfg.HTTPAddr) if err != nil { log.Printf("opensandbox-worker: Redis heartbeat not available: %v", err) } else { hb.SetWorkerVersion(WorkerVersion) if qemuMgr != nil { hb.SetGoldenVersion(qemuMgr.GoldenVersion()) + hb.SetAcceptsCreates(false) + hb.SetAcceptsMigrations(qemuMgr.GoldenVersion() != "") } if envID := os.Getenv("OPENSANDBOX_MACHINE_ID"); envID != "" { hb.SetMachineID(envID) @@ -719,6 +723,23 @@ func main() { } } + if qemuMgr != nil { + go func() { + log.Println("opensandbox-worker: preparing golden snapshot in background") + if err := qemuMgr.PrepareGoldenSnapshot(); err != nil { + log.Printf("opensandbox-worker: WARNING: golden snapshot failed, using cold boot: %v", err) + } + if hb != nil { + hb.SetGoldenVersion(qemuMgr.GoldenVersion()) + hb.SetAcceptsCreates(true) + if qemuMgr.GoldenVersion() != "" { + hb.SetAcceptsMigrations(true) + } + } + log.Println("opensandbox-worker: create readiness enabled") + }() + } + // CF-parallel: Redis Streams event publisher. Inert unless CellID is set. // (The legacy NATS publisher used to run alongside this; it was removed // once Redis Streams covered all event types end-to-end. 
NATSURL in the diff --git a/deploy/packer/worker-ami-aws.pkr.hcl b/deploy/packer/worker-ami-aws.pkr.hcl index 9a7b4edf..04fee27a 100644 --- a/deploy/packer/worker-ami-aws.pkr.hcl +++ b/deploy/packer/worker-ami-aws.pkr.hcl @@ -20,10 +20,15 @@ # # 1. Build binaries for linux/amd64: # CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.WorkerVersion=$(git rev-parse --short HEAD)" \ # -o bin/opensandbox-worker ./cmd/worker/ +# CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.ServerVersion=$(git rev-parse --short HEAD)" \ +# -o bin/opensandbox-server ./cmd/server/ # CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/osb-agent ./cmd/agent/ # # # 2. Build the rootfs context tarball: # tar czf /tmp/packer-rootfs-ctx.tar.gz deploy/firecracker/rootfs/ deploy/ec2/build-rootfs-docker.sh scripts/claude-agent-wrapper/ +# npm --prefix web ci +# npm --prefix web run build +# tar czf /tmp/packer-web-dist.tar.gz web/dist/ # # # 3. Run packer: # packer init deploy/packer/worker-ami-aws.pkr.hcl @@ -71,6 +76,11 @@ variable "worker_binary" { default = "bin/opensandbox-worker" } +variable "server_binary" { + type = string + default = "bin/opensandbox-server" +} + variable "agent_binary" { type = string default = "bin/osb-agent" @@ -88,6 +98,12 @@ variable "vector_context" { description = "Pre-built tarball of deploy/vector/ (config + populator + units). Pre-create with: tar czf /tmp/packer-vector-ctx.tar.gz deploy/vector/" } +variable "web_dist_context" { + type = string + default = "/tmp/packer-web-dist.tar.gz" + description = "Pre-built tarball of web/dist/. 
Pre-create with: npm --prefix web ci && npm --prefix web run build && tar czf /tmp/packer-web-dist.tar.gz web/dist/" +} + variable "golden_cache_bucket" { type = string default = "" @@ -162,6 +178,10 @@ build { source = var.worker_binary destination = "/tmp/opensandbox-worker" } + provisioner "file" { + source = var.server_binary + destination = "/tmp/opensandbox-server" + } provisioner "file" { source = var.agent_binary destination = "/tmp/osb-agent" @@ -188,6 +208,10 @@ build { source = var.vector_context destination = "/tmp/vector-ctx.tar.gz" } + provisioner "file" { + source = var.web_dist_context + destination = "/tmp/web-dist.tar.gz" + } provisioner "shell" { inline = [ "mkdir -p /tmp/vector", @@ -220,8 +244,13 @@ build { # Install worker + agent binaries. "mv /tmp/opensandbox-worker /usr/local/bin/opensandbox-worker", "chmod +x /usr/local/bin/opensandbox-worker", + "mv /tmp/opensandbox-server /usr/local/bin/opensandbox-server", + "chmod +x /usr/local/bin/opensandbox-server", "mv /tmp/osb-agent /usr/local/bin/osb-agent", "chmod +x /usr/local/bin/osb-agent", + "mkdir -p /usr/local/bin/web", + "tar xzf /tmp/web-dist.tar.gz -C /usr/local/bin/web --strip-components=1", + "rm /tmp/web-dist.tar.gz", # Install systemd unit. 
"mv /tmp/opensandbox-worker.service /etc/systemd/system/opensandbox-worker.service", diff --git a/internal/api/sandbox.go b/internal/api/sandbox.go index dbd4b400..5b0d09ee 100644 --- a/internal/api/sandbox.go +++ b/internal/api/sandbox.go @@ -1020,6 +1020,14 @@ func (s *Server) migrateSandbox(c echo.Context) error { return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandbox must be running to migrate"}) } + targetWorker := s.workerRegistry.GetWorker(req.TargetWorker) + if targetWorker == nil { + return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "target worker not found"}) + } + if !targetWorker.AcceptsMigrationRouting() { + return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "target worker is not accepting migrations"}) + } + // Mark as migrating — blocks exec/proxy routing until migration completes migrationDone := false if s.store != nil { @@ -1509,6 +1517,9 @@ func (s *Server) findScaleMigrationTargets(sourceWorkerID string, requestedMemMB if w.ID == sourceWorkerID { continue } + if !w.AcceptsMigrationRouting() { + continue + } if w.Draining { continue } diff --git a/internal/controlplane/redis_registry.go b/internal/controlplane/redis_registry.go index 745fd765..2b72d187 100644 --- a/internal/controlplane/redis_registry.go +++ b/internal/controlplane/redis_registry.go @@ -52,20 +52,22 @@ const ( // WorkerEntry represents a worker in the Redis-backed registry. 
type WorkerEntry struct { - ID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` + ID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` DiskPct float64 `json:"disk_pct"` TotalMemoryMB int `json:"total_memory_mb,omitempty"` CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` GoldenVersion string `json:"golden_version,omitempty"` WorkerVersion string `json:"worker_version,omitempty"` + AcceptsCreates bool `json:"accepts_creates,omitempty"` + AcceptsMigrations bool `json:"accepts_migrations,omitempty"` Draining bool `json:"draining,omitempty"` // Per-sandbox stats published by the worker. Bounded by per-host sandbox @@ -88,13 +90,13 @@ type SandboxStats struct { // backed by Redis pub/sub for real-time updates and periodic SCAN for reconciliation. // It also maintains a persistent gRPC connection pool to workers. 
type RedisWorkerRegistry struct { - rdb *redis.Client - mu sync.RWMutex - workers map[string]*WorkerEntry // in-memory hot cache - conns map[string]*grpc.ClientConn // persistent gRPC connections - clients map[string]pb.SandboxWorkerClient // cached gRPC clients - rrCounter uint64 // round-robin counter for tie-breaking - stop chan struct{} + rdb *redis.Client + mu sync.RWMutex + workers map[string]*WorkerEntry // in-memory hot cache + conns map[string]*grpc.ClientConn // persistent gRPC connections + clients map[string]pb.SandboxWorkerClient // cached gRPC clients + rrCounter uint64 // round-robin counter for tie-breaking + stop chan struct{} // onWorkerRejoined fires when a worker registers — both genuinely new and // after being pruned for missed heartbeats. Used by the reconcile-on- @@ -314,6 +316,8 @@ func (r *RedisWorkerRegistry) reconcileAndPrune() { // handleHeartbeat updates the in-memory worker map and dials gRPC if this is a new worker. func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { + normalizeWorkerEntryReadiness(&entry) + // Read drain state outside the lock — this is a network call and we don't // want to block other registry ops on Redis latency. The drain key is the // cross-CP source of truth; per-CP SetDraining writes it on the admin @@ -337,6 +341,8 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { existing.DiskPct = entry.DiskPct existing.TotalMemoryMB = entry.TotalMemoryMB existing.CommittedMemoryMB = entry.CommittedMemoryMB + existing.AcceptsCreates = entry.AcceptsCreates + existing.AcceptsMigrations = entry.AcceptsMigrations existing.Draining = drainOverride if entry.GoldenVersion != "" { existing.GoldenVersion = entry.GoldenVersion @@ -362,7 +368,7 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { // during the unreachable window. See internal/controlplane/reconcile.go. 
entry.Draining = drainOverride r.workers[entry.ID] = &entry - log.Printf("redis_registry: new worker registered: %s (region=%s, grpc=%s, draining=%v)", entry.ID, entry.Region, entry.GRPCAddr, drainOverride) + log.Printf("redis_registry: new worker registered: %s (region=%s, grpc=%s, draining=%v, acceptsCreates=%v, acceptsMigrations=%v)", entry.ID, entry.Region, entry.GRPCAddr, drainOverride, entry.AcceptsCreates, entry.AcceptsMigrations) if r.onWorkerRejoined != nil { // Fire in a goroutine — reconcile may take a few seconds (DB query // + a DestroySandbox RPC per stale entry) and we don't want @@ -439,6 +445,16 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { } } +func normalizeWorkerEntryReadiness(w *WorkerEntry) { + // Older workers did not publish these booleans. If both are absent they + // decode as false; preserve legacy behavior by treating them as ready for + // both placement classes. + if !w.AcceptsCreates && !w.AcceptsMigrations { + w.AcceptsCreates = true + w.AcceptsMigrations = true + } +} + // dialWorkerLocked dials a gRPC connection to a worker. Must be called with r.mu held. 
func (r *RedisWorkerRegistry) dialWorkerLocked(workerID, grpcAddr string) { creds, err := grpctls.ClientCredentials() @@ -582,6 +598,9 @@ func (r *RedisWorkerRegistry) collectEligibleLocked(region string, anyRegion boo if w.Draining { continue } + if !w.AcceptsCreateRouting() { + continue + } if w.CPUPct >= routingHardCapPct || w.MemPct >= routingHardCapPct || w.DiskPct >= routingHardCapPct { continue } @@ -751,20 +770,22 @@ func (r *RedisWorkerRegistry) GetWorkersByRegion(region string) []*WorkerInfo { for _, w := range r.workers { if w.Region == region { result = append(result, &WorkerInfo{ - ID: w.ID, - MachineID: w.MachineID, - Region: w.Region, - GRPCAddr: w.GRPCAddr, - HTTPAddr: w.HTTPAddr, - Capacity: w.Capacity, - Current: w.Current, - CPUPct: w.CPUPct, - MemPct: w.MemPct, - DiskPct: w.DiskPct, + ID: w.ID, + MachineID: w.MachineID, + Region: w.Region, + GRPCAddr: w.GRPCAddr, + HTTPAddr: w.HTTPAddr, + Capacity: w.Capacity, + Current: w.Current, + CPUPct: w.CPUPct, + MemPct: w.MemPct, + DiskPct: w.DiskPct, TotalMemoryMB: w.TotalMemoryMB, CommittedMemoryMB: w.CommittedMemoryMB, - GoldenVersion: w.GoldenVersion, - WorkerVersion: w.WorkerVersion, + GoldenVersion: w.GoldenVersion, + WorkerVersion: w.WorkerVersion, + AcceptsCreates: w.AcceptsCreates, + AcceptsMigrations: w.AcceptsMigrations, }) } } diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index ed306b21..29513177 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -20,9 +20,9 @@ import ( ) const ( - scaleUpThreshold = 0.50 // Scale up when utilization > 50% (gives ~3 min runway for new worker to boot) - scaleDownThreshold = 0.20 // Scale down when utilization < 20% - maxWorkersPerRegion = 10 // Hard cap to prevent runaway launches + scaleUpThreshold = 0.50 // Scale up when utilization > 50% (gives ~3 min runway for new worker to boot) + scaleDownThreshold = 0.20 // Scale down when utilization < 20% + maxWorkersPerRegion = 10 // Hard cap to 
prevent runaway launches pendingWorkerTTL = 10 * time.Minute // How long to wait for a launched worker to register // Resource-based scaling thresholds (applied per-worker, trigger on ANY worker exceeding) @@ -40,9 +40,9 @@ const ( emergencyCPUThreshold = 95.0 emergencyMemThreshold = 95.0 emergencyDiskThreshold = 90.0 - evacuationBatchSize = 3 // sandboxes to migrate per eval cycle per worker - evacuationCooldown = 60 * time.Second // per-worker cooldown between evacuation batches - drainTimeout = 45 * time.Minute // max time to drain a worker via live migration (allows 30 sandboxes × 10min each in batches of 3) + evacuationBatchSize = 3 // sandboxes to migrate per eval cycle per worker + evacuationCooldown = 60 * time.Second // per-worker cooldown between evacuation batches + drainTimeout = 45 * time.Minute // max time to drain a worker via live migration (allows 30 sandboxes × 10min each in batches of 3) creationFailureThreshold = 3 // consecutive failures before exponential backoff creationBackoffMin = 1 * time.Minute // initial backoff after threshold hit @@ -76,14 +76,14 @@ type OrphanCleaner interface { type ScalerConfig struct { Pool compute.Pool Registry ScalerRegistry - Store *db.Store // for updating session worker_id after migration + Store *db.Store // for updating session worker_id after migration StateStore ScalerStateStore // optional: persists scaler state to Redis (nil = in-memory) WorkerImage string Cooldown time.Duration // minimum time between scale-up actions per region Interval time.Duration // how often to evaluate scaling (0 = default 30s) - MinWorkers int // minimum total workers per region (0 = default 1). Always kept running. - MaxWorkers int // maximum workers per region (0 = default 10). Hard cap to prevent runaway launches. - IdleReserve int // target idle (0 sandbox) workers for burst absorption (0 = default 1). Separate from MinWorkers. + MinWorkers int // minimum total workers per region (0 = default 1). Always kept running. 
+ MaxWorkers int // maximum workers per region (0 = default 10). Hard cap to prevent runaway launches. + IdleReserve int // target idle (0 sandbox) workers for burst absorption (0 = default 1). Separate from MinWorkers. // Event emit for D1 sandboxes_index sync. After a scaler-triggered // migration succeeds (rolling replace, evacuation), XADD a "migrated" @@ -123,17 +123,17 @@ type Scaler struct { image string cooldown time.Duration interval time.Duration - minWorkers int - maxWorkers int - idleReserve int + minWorkers int + maxWorkers int + idleReserve int - rdb *redis.Client - cellID string + rdb *redis.Client + cellID string - mu sync.Mutex // protects stop/cancel - cancel context.CancelFunc - wg sync.WaitGroup - running bool + mu sync.Mutex // protects stop/cancel + cancel context.CancelFunc + wg sync.WaitGroup + running bool machineSizes []string // ranked list of provider-specific sizes for scale-up fallback @@ -750,6 +750,9 @@ func (s *Scaler) findMigrationTarget(region, excludeWorkerID string, requiredMem if w.ID == excludeWorkerID { continue } + if !w.AcceptsMigrationRouting() { + continue + } if s.state.IsDraining(w.MachineID) { continue } diff --git a/internal/controlplane/scaler_test.go b/internal/controlplane/scaler_test.go index 663bed17..5c3eff91 100644 --- a/internal/controlplane/scaler_test.go +++ b/internal/controlplane/scaler_test.go @@ -173,12 +173,12 @@ func (p *mockPool) DestroyMachine(_ context.Context, machineID string) error { return nil } -func (p *mockPool) DrainMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) StartMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) StopMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) HealthCheck(_ context.Context, _ string) error { return nil } -func (p *mockPool) CleanupOrphanedResources(_ context.Context) (int, error) { return 0, nil } -func (p *mockPool) ListMachines(_ context.Context) ([]*compute.Machine, error) { 
return nil, nil } +func (p *mockPool) DrainMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) StartMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) StopMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) HealthCheck(_ context.Context, _ string) error { return nil } +func (p *mockPool) CleanupOrphanedResources(_ context.Context) (int, error) { return 0, nil } +func (p *mockPool) ListMachines(_ context.Context) ([]*compute.Machine, error) { return nil, nil } func (p *mockPool) SupportedRegions(_ context.Context) ([]string, error) { return []string{"us-east-1"}, nil } @@ -676,6 +676,59 @@ func TestFindMigrationTargetSelectsLeastLoaded(t *testing.T) { } } +func TestFindMigrationTargetSkipsMigrationDisabledWorker(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "source", MachineID: "osb-worker-source", Region: "us-east-1", + Capacity: 50, Current: 20, CPUPct: 70, MemPct: 50, DiskPct: 30, + }) + reg.addWorker(&WorkerInfo{ + ID: "disabled", MachineID: "osb-worker-disabled", Region: "us-east-1", + Capacity: 50, Current: 1, CPUPct: 5, MemPct: 5, DiskPct: 5, + AcceptsCreates: true, AcceptsMigrations: false, + }) + reg.addWorker(&WorkerInfo{ + ID: "ready", MachineID: "osb-worker-ready", Region: "us-east-1", + Capacity: 50, Current: 10, CPUPct: 20, MemPct: 20, DiskPct: 20, + AcceptsCreates: true, AcceptsMigrations: true, + }) + + s := newTestScaler(reg, pool) + target := s.findMigrationTarget("us-east-1", "source", 0) + if target == nil { + t.Fatal("expected a migration target") + } + if target.ID != "ready" { + t.Errorf("expected ready worker as target, got %s", target.ID) + } +} + +func TestFindMigrationTargetAllowsCreateDisabledWorker(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "source", MachineID: "osb-worker-source", Region: "us-east-1", + Capacity: 50, Current: 20, CPUPct: 70, 
MemPct: 50, DiskPct: 30, + }) + reg.addWorker(&WorkerInfo{ + ID: "target", MachineID: "osb-worker-target", Region: "us-east-1", + Capacity: 50, Current: 1, CPUPct: 5, MemPct: 5, DiskPct: 5, + AcceptsCreates: false, AcceptsMigrations: true, + }) + + s := newTestScaler(reg, pool) + target := s.findMigrationTarget("us-east-1", "source", 0) + if target == nil { + t.Fatal("expected a migration target") + } + if target.ID != "target" { + t.Errorf("expected target worker as target, got %s", target.ID) + } +} + func TestFindMigrationTargetSkipsPressuredWorkers(t *testing.T) { reg := newMockRegistry() pool := newMockPool() diff --git a/internal/controlplane/worker_readiness_test.go b/internal/controlplane/worker_readiness_test.go new file mode 100644 index 00000000..045bd3a6 --- /dev/null +++ b/internal/controlplane/worker_readiness_test.go @@ -0,0 +1,26 @@ +package controlplane + +import "testing" + +func TestWorkerEntryRoutingReadiness(t *testing.T) { + legacy := &WorkerEntry{} + if !legacy.AcceptsCreateRouting() || !legacy.AcceptsMigrationRouting() { + t.Fatal("legacy worker with no readiness fields should accept both routing classes") + } + + migrationOnly := &WorkerEntry{AcceptsCreates: false, AcceptsMigrations: true} + if migrationOnly.AcceptsCreateRouting() { + t.Fatal("migration-only worker should not accept create routing") + } + if !migrationOnly.AcceptsMigrationRouting() { + t.Fatal("migration-only worker should accept migration routing") + } + + createOnly := &WorkerEntry{AcceptsCreates: true, AcceptsMigrations: false} + if !createOnly.AcceptsCreateRouting() { + t.Fatal("create-only worker should accept create routing") + } + if createOnly.AcceptsMigrationRouting() { + t.Fatal("create-only worker should not accept migration routing") + } +} diff --git a/internal/controlplane/worker_registry.go b/internal/controlplane/worker_registry.go index 7dc9e77f..0e05b9e9 100644 --- a/internal/controlplane/worker_registry.go +++ b/internal/controlplane/worker_registry.go @@ 
-13,22 +13,24 @@ import ( // WorkerInfo represents a registered worker. type WorkerInfo struct { - ID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` - DiskPct float64 `json:"disk_pct"` - TotalMemoryMB int `json:"total_memory_mb,omitempty"` - CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` - GoldenVersion string `json:"golden_version,omitempty"` - WorkerVersion string `json:"worker_version,omitempty"` - LastSeen time.Time `json:"-"` - MissedBeats int `json:"-"` + ID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` + DiskPct float64 `json:"disk_pct"` + TotalMemoryMB int `json:"total_memory_mb,omitempty"` + CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` + GoldenVersion string `json:"golden_version,omitempty"` + WorkerVersion string `json:"worker_version,omitempty"` + AcceptsCreates bool `json:"accepts_creates,omitempty"` + AcceptsMigrations bool `json:"accepts_migrations,omitempty"` + LastSeen time.Time `json:"-"` + MissedBeats int `json:"-"` } // WorkerRegistry tracks live workers from NATS heartbeats. 
@@ -125,6 +127,9 @@ func (r *WorkerRegistry) GetLeastLoadedWorker(region string) *WorkerInfo { var best *WorkerInfo bestScore := -1.0 for _, w := range workers { + if !w.AcceptsCreateRouting() { + continue + } remaining := w.Capacity - w.Current if remaining <= 0 { continue @@ -226,6 +231,7 @@ func (r *WorkerRegistry) handleHeartbeat(msg *nats.Msg) { if err := json.Unmarshal(msg.Data, &hb); err != nil { return } + normalizeWorkerInfoReadiness(&hb) r.mu.Lock() defer r.mu.Unlock() @@ -237,6 +243,8 @@ func (r *WorkerRegistry) handleHeartbeat(msg *nats.Msg) { existing.CPUPct = hb.CPUPct existing.MemPct = hb.MemPct existing.DiskPct = hb.DiskPct + existing.AcceptsCreates = hb.AcceptsCreates + existing.AcceptsMigrations = hb.AcceptsMigrations if hb.GoldenVersion != "" { existing.GoldenVersion = hb.GoldenVersion } @@ -261,6 +269,40 @@ func (r *WorkerRegistry) handleHeartbeat(msg *nats.Msg) { } } +func normalizeWorkerInfoReadiness(w *WorkerInfo) { + // Older workers did not publish these booleans. If both are absent they + // decode as false; preserve legacy behavior by treating them as ready for + // both placement classes. + if !w.AcceptsCreates && !w.AcceptsMigrations { + w.AcceptsCreates = true + w.AcceptsMigrations = true + } +} + +// AcceptsCreateRouting returns whether this worker should receive new sandbox +// creates. Both fields false is treated as legacy/unknown and therefore ready. +func (w *WorkerInfo) AcceptsCreateRouting() bool { + return w.AcceptsCreates || (!w.AcceptsCreates && !w.AcceptsMigrations) +} + +// AcceptsMigrationRouting returns whether this worker should receive incoming +// live migrations. Both fields false is treated as legacy/unknown and ready. +func (w *WorkerInfo) AcceptsMigrationRouting() bool { + return w.AcceptsMigrations || (!w.AcceptsCreates && !w.AcceptsMigrations) +} + +// AcceptsCreateRouting returns whether this worker should receive new sandbox +// creates. Both fields false is treated as legacy/unknown and therefore ready. 
+func (w *WorkerEntry) AcceptsCreateRouting() bool { + return w.AcceptsCreates || (!w.AcceptsCreates && !w.AcceptsMigrations) +} + +// AcceptsMigrationRouting returns whether this worker should receive incoming +// live migrations. Both fields false is treated as legacy/unknown and ready. +func (w *WorkerEntry) AcceptsMigrationRouting() bool { + return w.AcceptsMigrations || (!w.AcceptsCreates && !w.AcceptsMigrations) +} + func (r *WorkerRegistry) checkStaleWorkers() { r.mu.Lock() defer r.mu.Unlock() diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index 7c63aa4a..9a117cbe 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -454,9 +454,69 @@ func (m *Manager) SetHibernationUploadCallback(cb func(sandboxID, hibernationKey // GoldenVersion returns the hash identifying this worker's golden snapshot base image. // Empty string means no golden snapshot is available. func (m *Manager) GoldenVersion() string { + m.mu.RLock() + defer m.mu.RUnlock() return m.goldenVersion } +// LoadGoldenVersionFromImage seeds GoldenVersion from the baked image metadata. +// It is intentionally lightweight so the worker can advertise its base version +// before the runtime golden memory snapshot has finished preparing. 
+func (m *Manager) LoadGoldenVersionFromImage() error { + baseImage, err := ResolveBaseImage(m.cfg.ImagesDir, "default") + if err != nil { + return fmt.Errorf("resolve base image: %w", err) + } + v, err := m.computeBaseGoldenVersion(baseImage) + if err != nil { + return err + } + m.mu.Lock() + m.goldenVersion = v + m.mu.Unlock() + log.Printf("qemu: loaded base golden version %s", v) + return nil +} + +func (m *Manager) computeBaseGoldenVersion(baseImage string) (string, error) { + versionPath := filepath.Join(m.cfg.ImagesDir, "golden-version") + if b, err := os.ReadFile(versionPath); err == nil { + if v := strings.TrimSpace(string(b)); v != "" { + return v, nil + } + } + return computeGoldenVersion(baseImage) +} + +func (m *Manager) setGoldenSnapshot(dir, version string, cid uint32, guestIP, hostIP string) { + m.mu.Lock() + defer m.mu.Unlock() + m.goldenDir = dir + m.goldenVersion = version + m.goldenCID = cid + m.goldenGuestIP = guestIP + m.goldenHostIP = hostIP +} + +func (m *Manager) goldenSnapshot() (dir, version string) { + m.mu.RLock() + defer m.mu.RUnlock() + return m.goldenDir, m.goldenVersion +} + +func (m *Manager) setGoldenDir(dir string) { + m.mu.Lock() + defer m.mu.Unlock() + m.goldenDir = dir +} + +func (m *Manager) restoreGoldenSnapshot(dir, version string) { + m.mu.Lock() + defer m.mu.Unlock() + m.goldenDir = dir + m.goldenVersion = version +} + // MemoryAllocatedBytes returns the sum of memory committed to currently-running // sandboxes, in bytes. Used by the worker's resource-stats tick to report // oversubscription independent of actual guest workload. 
@@ -728,32 +788,34 @@ func (m *Manager) PrepareGoldenSnapshot() error { stale := false baseImage, _ := ResolveBaseImage(m.cfg.ImagesDir, "default") if baseImage != "" && storedVersion != "" { - if currentHash, err := computeGoldenVersion(baseImage); err == nil && currentHash != storedVersion { + if currentHash, err := m.computeBaseGoldenVersion(baseImage); err == nil && currentHash != storedVersion { log.Printf("qemu: base image changed (golden=%s, disk=%s), rebuilding golden snapshot", storedVersion, currentHash) stale = true } } if !stale { - m.goldenDir = goldenDir - m.goldenVersion = storedVersion + goldenVersion := storedVersion + var goldenCID uint32 + var goldenGuestIP, goldenHostIP string if cidBytes, err := os.ReadFile(filepath.Join(goldenDir, "cid")); err == nil { - fmt.Sscanf(string(cidBytes), "%d", &m.goldenCID) + fmt.Sscanf(string(cidBytes), "%d", &goldenCID) } if ipBytes, err := os.ReadFile(filepath.Join(goldenDir, "guest_ip")); err == nil { - m.goldenGuestIP = string(ipBytes) + goldenGuestIP = string(ipBytes) } if ipBytes, err := os.ReadFile(filepath.Join(goldenDir, "host_ip")); err == nil { - m.goldenHostIP = string(ipBytes) + goldenHostIP = string(ipBytes) } if storedVersion == "" && baseImage != "" { - if v, err := computeGoldenVersion(baseImage); err == nil { - m.goldenVersion = v + if v, err := m.computeBaseGoldenVersion(baseImage); err == nil { + goldenVersion = v _ = os.WriteFile(versionFile, []byte(v), 0644) } } - log.Printf("qemu: golden snapshot already exists at %s (CID=%d, guestIP=%s, version=%s)", goldenDir, m.goldenCID, m.goldenGuestIP, m.goldenVersion) - go m.uploadBaseImageIfNew(m.goldenVersion) + m.setGoldenSnapshot(goldenDir, goldenVersion, goldenCID, goldenGuestIP, goldenHostIP) + log.Printf("qemu: golden snapshot already exists at %s (CID=%d, guestIP=%s, version=%s)", goldenDir, goldenCID, goldenGuestIP, goldenVersion) + go m.uploadBaseImageIfNew(goldenVersion) return nil } @@ -967,24 +1029,22 @@ func (m *Manager) 
PrepareGoldenSnapshot() error { } // Compute and persist golden version hash - if v, err := computeGoldenVersion(baseImage); err == nil { - m.goldenVersion = v + goldenVersion := m.GoldenVersion() + if v, err := m.computeBaseGoldenVersion(baseImage); err == nil { + goldenVersion = v _ = os.WriteFile(filepath.Join(goldenDir, "version"), []byte(v), 0644) } // Remove preparing marker — golden snapshot is complete os.Remove(preparingMarker) - m.goldenDir = goldenDir - m.goldenCID = goldenCID - m.goldenGuestIP = netCfg.GuestIP - m.goldenHostIP = netCfg.HostIP + m.setGoldenSnapshot(goldenDir, goldenVersion, goldenCID, netCfg.GuestIP, netCfg.HostIP) _ = os.WriteFile(filepath.Join(goldenDir, "cid"), []byte(fmt.Sprintf("%d", goldenCID)), 0644) _ = os.WriteFile(filepath.Join(goldenDir, "guest_ip"), []byte(netCfg.GuestIP), 0644) _ = os.WriteFile(filepath.Join(goldenDir, "host_ip"), []byte(netCfg.HostIP), 0644) log.Printf("qemu: golden snapshot ready (%dms total, mem=%s, CID=%d, guestIP=%s, version=%s)", - time.Since(t0).Milliseconds(), memFile, goldenCID, netCfg.GuestIP, m.goldenVersion) - go m.uploadBaseImageIfNew(m.goldenVersion) + time.Since(t0).Milliseconds(), memFile, goldenCID, netCfg.GuestIP, goldenVersion) + go m.uploadBaseImageIfNew(goldenVersion) return nil } @@ -993,7 +1053,7 @@ func (m *Manager) PrepareGoldenSnapshot() error { // independent reflink copies — only new sandboxes use the new golden. // Returns the old and new golden version strings. 
func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err error) { - oldVersion = m.goldenVersion + oldGoldenDir, oldVersion := m.goldenSnapshot() goldenDir := filepath.Join(m.cfg.DataDir, "golden") // Build new golden in a staging directory @@ -1001,14 +1061,13 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er os.RemoveAll(stagingDir) // clean up any prior failed attempt // Temporarily point goldenDir to staging so PrepareGoldenSnapshot builds there - oldGoldenDir := m.goldenDir - m.goldenDir = "" + m.setGoldenDir("") // Rename current golden out of the way so PrepareGoldenSnapshot sees no existing snapshot backupDir := filepath.Join(m.cfg.DataDir, "golden-old") os.RemoveAll(backupDir) if err := os.Rename(goldenDir, backupDir); err != nil && !os.IsNotExist(err) { - m.goldenDir = oldGoldenDir + m.setGoldenDir(oldGoldenDir) return oldVersion, "", fmt.Errorf("backup old golden: %w", err) } @@ -1017,13 +1076,12 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er // Restore old golden on failure os.RemoveAll(goldenDir) if backupErr := os.Rename(backupDir, goldenDir); backupErr == nil { - m.goldenDir = oldGoldenDir - m.goldenVersion = oldVersion + m.restoreGoldenSnapshot(oldGoldenDir, oldVersion) } return oldVersion, "", fmt.Errorf("rebuild golden: %w", err) } - newVersion = m.goldenVersion + newVersion = m.GoldenVersion() // Clean up old golden — sandboxes created from it have independent reflink copies os.RemoveAll(backupDir) @@ -1037,6 +1095,10 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er // After restore, we patch the network config inside the guest. 
func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, id string) (*types.Sandbox, error) { t0 := time.Now() + goldenDir, goldenVersion := m.goldenSnapshot() + if goldenDir == "" { + return nil, fmt.Errorf("golden snapshot not ready") + } template := cfg.Template if template == "" || template == "base" { @@ -1050,7 +1112,7 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, // Copy golden rootfs as qcow2 overlay (golden snapshot was taken with qcow2 drives) rootfsPath := filepath.Join(sandboxDir, "rootfs.qcow2") - goldenRootfs := filepath.Join(m.goldenDir, "rootfs.qcow2") + goldenRootfs := filepath.Join(goldenDir, "rootfs.qcow2") if err := copyFileReflink(goldenRootfs, rootfsPath); err != nil { os.RemoveAll(sandboxDir) return nil, fmt.Errorf("copy golden rootfs: %w", err) @@ -1062,7 +1124,7 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, workspacePath := filepath.Join(sandboxDir, "workspace.qcow2") diskMB := m.cfg.DefaultDiskMB var goldenWSUUID string - if data, readErr := os.ReadFile(filepath.Join(m.goldenDir, "workspace_uuid")); readErr == nil { + if data, readErr := os.ReadFile(filepath.Join(goldenDir, "workspace_uuid")); readErr == nil { goldenWSUUID = strings.TrimSpace(string(data)) } if err := CreateWorkspace(workspacePath, diskMB, goldenWSUUID); err != nil { @@ -1159,8 +1221,8 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, // Build QEMU args with -incoming to restore from golden snapshot. // Use zstd-compressed mem file if available (less EBS I/O despite CPU cost). 
- goldenMemZst := filepath.Join(m.goldenDir, "mem.zst") - goldenMemRaw := filepath.Join(m.goldenDir, "mem") + goldenMemZst := filepath.Join(goldenDir, "mem.zst") + goldenMemRaw := filepath.Join(goldenDir, "mem") var incomingURI string if fileExists(goldenMemZst) { incomingURI = fmt.Sprintf("exec:zstdcat %s", goldenMemZst) @@ -1240,7 +1302,7 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, guestMAC: guestMAC, guestCID: guestCID, bootArgs: bootArgs, - goldenVersion: m.goldenVersion, + goldenVersion: goldenVersion, } // Connect to agent via Unix socket @@ -1601,7 +1663,8 @@ func (m *Manager) Create(ctx context.Context, cfg types.SandboxConfig) (sb *type } // Fast path: restore from golden snapshot if available and using default template - if m.goldenDir != "" && template == "default" && cfg.TemplateRootfsKey == "" { + goldenDir, _ := m.goldenSnapshot() + if goldenDir != "" && template == "default" && cfg.TemplateRootfsKey == "" { sb, err := m.createFromGolden(ctx, cfg, id) if err != nil { log.Printf("qemu: golden restore failed for %s, falling back to cold boot: %v", id, err) @@ -1774,7 +1837,7 @@ func (m *Manager) Create(ctx context.Context, cfg types.SandboxConfig) (sb *type guestMAC: guestMAC, guestCID: guestCID, bootArgs: bootArgs, - goldenVersion: m.goldenVersion, // set even on cold boot — VM uses the same base image + goldenVersion: m.GoldenVersion(), // set even on cold boot — VM uses the same base image } // Wait for agent via Unix socket @@ -3737,7 +3800,7 @@ func (m *Manager) ForkFromCheckpoint(ctx context.Context, checkpointID string, c guestCID: guestCID, bootArgs: bootArgs, agent: agent, - goldenVersion: m.goldenVersion, // set on wake — VM uses the current base image + goldenVersion: m.GoldenVersion(), // set on wake — VM uses the current base image } m.mu.Lock() diff --git a/internal/worker/redis_heartbeat.go b/internal/worker/redis_heartbeat.go index 82c22d60..e48b4de2 100644 --- a/internal/worker/redis_heartbeat.go 
+++ b/internal/worker/redis_heartbeat.go @@ -13,20 +13,22 @@ import ( // redisHeartbeatPayload is the JSON structure published to Redis. type redisHeartbeatPayload struct { - WorkerID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID (e.g. i-099088f8ac4a34ef3) - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` + WorkerID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID (e.g. i-099088f8ac4a34ef3) + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` DiskPct float64 `json:"disk_pct"` TotalMemoryMB int `json:"total_memory_mb,omitempty"` CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` GoldenVersion string `json:"golden_version,omitempty"` WorkerVersion string `json:"worker_version,omitempty"` + AcceptsCreates bool `json:"accepts_creates,omitempty"` + AcceptsMigrations bool `json:"accepts_migrations,omitempty"` // Per-sandbox stats snapshot. Populated by the worker's stats collector // (see internal/qemu/stats_collector.go) and consumed by the CP autoscaler @@ -52,21 +54,24 @@ type SandboxStatsWire struct { // 1. SETs worker:{id} with a 30s TTL (auto-expires if worker dies) // 2. 
PUBLISHes to workers:heartbeat for real-time server notification type RedisHeartbeat struct { - rdb *redis.Client - workerID string - machineID string - region string - grpcAddr string - httpAddr string - getStats func() (capacity, current int, cpuPct, memPct, diskPct float64) - getMemoryInfo func() (totalMB, committedMB int) // optional: committed memory for dynamic capacity - getSandboxStats func() map[string]SandboxStatsWire // optional: per-sandbox stats for autoscaler - onReconnect func() // called when heartbeat succeeds after a previous failure - goldenVersion string - workerVersion string - wasDown bool // true if the last publish failed (used to detect reconnect) - stop chan struct{} - stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer + rdb *redis.Client + workerID string + machineID string + region string + grpcAddr string + httpAddr string + getStats func() (capacity, current int, cpuPct, memPct, diskPct float64) + getMemoryInfo func() (totalMB, committedMB int) // optional: committed memory for dynamic capacity + getSandboxStats func() map[string]SandboxStatsWire // optional: per-sandbox stats for autoscaler + onReconnect func() // called when heartbeat succeeds after a previous failure + stateMu sync.RWMutex + goldenVersion string + workerVersion string + acceptsCreates bool + acceptsMigrations bool + wasDown bool // true if the last publish failed (used to detect reconnect) + stop chan struct{} + stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer } // NewRedisHeartbeat creates a new heartbeat publisher. 
@@ -92,30 +97,54 @@ func NewRedisHeartbeat(redisURL, workerID, region, grpcAddr, httpAddr string) (* } return &RedisHeartbeat{ - rdb: rdb, - workerID: workerID, - region: region, - grpcAddr: grpcAddr, - httpAddr: httpAddr, - stop: make(chan struct{}), + rdb: rdb, + workerID: workerID, + region: region, + grpcAddr: grpcAddr, + httpAddr: httpAddr, + acceptsCreates: true, + acceptsMigrations: true, + stop: make(chan struct{}), }, nil } // SetMachineID sets the EC2 instance ID for the heartbeat (used by scaler for drain/terminate). func (h *RedisHeartbeat) SetMachineID(id string) { + h.stateMu.Lock() + defer h.stateMu.Unlock() h.machineID = id } // SetGoldenVersion sets the golden snapshot version hash for the heartbeat. func (h *RedisHeartbeat) SetGoldenVersion(v string) { + h.stateMu.Lock() + defer h.stateMu.Unlock() h.goldenVersion = v } // SetWorkerVersion sets the worker binary version (git SHA) for the heartbeat. func (h *RedisHeartbeat) SetWorkerVersion(v string) { + h.stateMu.Lock() + defer h.stateMu.Unlock() h.workerVersion = v } +// SetAcceptsCreates controls whether the control plane may route new sandbox +// creates to this worker. +func (h *RedisHeartbeat) SetAcceptsCreates(v bool) { + h.stateMu.Lock() + defer h.stateMu.Unlock() + h.acceptsCreates = v +} + +// SetAcceptsMigrations controls whether the control plane may choose this +// worker as an incoming live-migration target. +func (h *RedisHeartbeat) SetAcceptsMigrations(v bool) { + h.stateMu.Lock() + defer h.stateMu.Unlock() + h.acceptsMigrations = v +} + // SetMemoryInfoFunc sets a callback that returns host total and committed memory in MB. // Used for dynamic capacity reporting. 
func (h *RedisHeartbeat) SetMemoryInfoFunc(fn func() (totalMB, committedMB int)) { @@ -159,20 +188,29 @@ func (h *RedisHeartbeat) Start(getStats func() (int, int, float64, float64, floa func (h *RedisHeartbeat) publish() { capacity, current, cpuPct, memPct, diskPct := h.getStats() + h.stateMu.RLock() + machineID := h.machineID + goldenVersion := h.goldenVersion + workerVersion := h.workerVersion + acceptsCreates := h.acceptsCreates + acceptsMigrations := h.acceptsMigrations + h.stateMu.RUnlock() payload := redisHeartbeatPayload{ - WorkerID: h.workerID, - MachineID: h.machineID, - Region: h.region, - GRPCAddr: h.grpcAddr, - HTTPAddr: h.httpAddr, - Capacity: capacity, - Current: current, - CPUPct: cpuPct, - MemPct: memPct, - DiskPct: diskPct, - GoldenVersion: h.goldenVersion, - WorkerVersion: h.workerVersion, + WorkerID: h.workerID, + MachineID: machineID, + Region: h.region, + GRPCAddr: h.grpcAddr, + HTTPAddr: h.httpAddr, + Capacity: capacity, + Current: current, + CPUPct: cpuPct, + MemPct: memPct, + DiskPct: diskPct, + GoldenVersion: goldenVersion, + WorkerVersion: workerVersion, + AcceptsCreates: acceptsCreates, + AcceptsMigrations: acceptsMigrations, } // Add committed memory info for dynamic capacity From aa9d98e57e4674f9a95c8fe00943cc6e2d0ae12d Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Tue, 2 Jun 2026 11:32:45 -0700 Subject: [PATCH 06/32] Add admin worker evacuation drill hook --- cmd/server/main.go | 16 +- internal/api/admin_workers.go | 27 ++++ internal/api/router.go | 108 +++++++------- internal/controlplane/scaler.go | 39 ++++- .../controlplane/spot_evacuation_sim_test.go | 141 ++++++++++++++++++ 5 files changed, 278 insertions(+), 53 deletions(-) create mode 100644 internal/controlplane/spot_evacuation_sim_test.go diff --git a/cmd/server/main.go b/cmd/server/main.go index 6b96175e..093d6a14 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -26,8 +26,8 @@ import ( "github.com/opensandbox/opensandbox/internal/db" 
"github.com/opensandbox/opensandbox/internal/edgeclient" "github.com/opensandbox/opensandbox/internal/metrics" - "github.com/opensandbox/opensandbox/internal/obslog" "github.com/opensandbox/opensandbox/internal/observability" + "github.com/opensandbox/opensandbox/internal/obslog" "github.com/opensandbox/opensandbox/internal/proxy" "github.com/opensandbox/opensandbox/internal/sandbox" "github.com/opensandbox/opensandbox/internal/storage" @@ -507,6 +507,7 @@ func main() { RedisClient: redisRegistry.RedisClient(), CellID: cfg.CellID, }) + opts.WorkerEvacuator = scaler defer scaler.Stop() // Leader election: only the leader runs the scaler. The @@ -526,6 +527,19 @@ func main() { defer leaderElector.Stop() log.Printf("opensandbox: leader election started (instance=%s)", leaderElector.InstanceID()) } + + if opts.WorkerEvacuator == nil && redisRegistry != nil { + scalerState := controlplane.NewRedisScalerState(redisRegistry.RedisClient()) + opts.WorkerEvacuator = controlplane.NewScaler(controlplane.ScalerConfig{ + Registry: redisRegistry, + Store: opts.Store, + StateStore: scalerState, + MaxWorkers: cfg.MaxWorkersPerRegion, + RedisClient: redisRegistry.RedisClient(), + CellID: cfg.CellID, + }) + log.Printf("opensandbox: admin worker evacuator configured without compute pool") + } } // Background maintenance tasks diff --git a/internal/api/admin_workers.go b/internal/api/admin_workers.go index 2dd5e206..841b61f8 100644 --- a/internal/api/admin_workers.go +++ b/internal/api/admin_workers.go @@ -49,3 +49,30 @@ func (s *Server) adminSetWorkerDraining(c echo.Context) error { "draining": drain, }) } + +// adminEvacuateWorker starts the scaler's live-migration drain loop for a +// worker. Unlike scaler scale-down, this does not terminate the machine after +// it becomes empty; it is an operator/test hook for spot evacuation drills. 
+// +// POST /admin/workers/:id/evacuate +func (s *Server) adminEvacuateWorker(c echo.Context) error { + if s.workerEvacuator == nil { + return c.JSON(http.StatusServiceUnavailable, map[string]string{ + "error": "worker evacuator not configured", + }) + } + + workerID := c.Param("id") + if workerID == "" { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "worker id required"}) + } + + if err := s.workerEvacuator.EvacuateWorker(c.Request().Context(), workerID); err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()}) + } + + return c.JSON(http.StatusAccepted, map[string]any{ + "workerID": workerID, + "evacuating": true, + }) +} diff --git a/internal/api/router.go b/internal/api/router.go index c492d1ea..2adca624 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -35,35 +35,40 @@ var errSandboxNotAvailable = map[string]string{ "error": "sandbox execution not available in server-only mode", } +type workerEvacuator interface { + EvacuateWorker(ctx context.Context, workerID string) error +} + // Server holds the API server dependencies. 
type Server struct { - echo *echo.Echo - manager sandbox.Manager - router *sandbox.SandboxRouter // routes all sandbox interactions (state machine, auto-wake, rolling timeout) - ptyManager *sandbox.PTYManager - store *db.Store // nil in combined/dev mode without PG - jwtIssuer *auth.JWTIssuer // nil if JWT not configured - capTokenIssuer *auth.JWTIssuer // verifies edge→CP capability tokens; nil if SESSION_JWT_SECRET unset - cfAdminSecret string // HMAC shared with CreditAccount DO for /admin/halt-org and /admin/resume-org; empty disables auth (dev only) - cfEventSecret string // HMAC shared with the api-edge Worker for /internal/secret-refresh and other edge-→cell push paths - cellID string // this control plane's cell_id (for the cap-token cell check) - mode string // "server", "worker", "combined" - workerID string // this worker's ID - region string // this worker's region - httpAddr string // public HTTP address for direct access - execSessionManager *sandbox.ExecSessionManager // nil if not configured - sandboxDBs *sandbox.SandboxDBManager // per-sandbox SQLite manager - workos *auth.WorkOSMiddleware // nil if WorkOS not configured - workerRegistry *controlplane.RedisWorkerRegistry // nil in combined/worker mode - checkpointStore *storage.CheckpointStore // nil if hibernation not configured - sandboxDomain string // base domain for sandbox subdomains - cfClient *cloudflare.Client // nil if Cloudflare not configured - pendingCreates sync.Map // map[sandboxID]*pendingCreate — async sandbox creation tracking - sandboxAPIProxy *proxy.SandboxAPIProxy // nil except in server mode (proxies data-plane to workers) - stripeClient *billing.StripeClient // nil if Stripe not configured - redisClient *redis.Client // nil if Redis not configured (for health checks) - adminEvents *AdminEventBus // real-time event bus for admin dashboard - ready int32 // atomic: 1 = ready, 0 = not ready + echo *echo.Echo + manager sandbox.Manager + router *sandbox.SandboxRouter // routes all 
sandbox interactions (state machine, auto-wake, rolling timeout) + ptyManager *sandbox.PTYManager + store *db.Store // nil in combined/dev mode without PG + jwtIssuer *auth.JWTIssuer // nil if JWT not configured + capTokenIssuer *auth.JWTIssuer // verifies edge→CP capability tokens; nil if SESSION_JWT_SECRET unset + cfAdminSecret string // HMAC shared with CreditAccount DO for /admin/halt-org and /admin/resume-org; empty disables auth (dev only) + cfEventSecret string // HMAC shared with the api-edge Worker for /internal/secret-refresh and other edge-→cell push paths + cellID string // this control plane's cell_id (for the cap-token cell check) + mode string // "server", "worker", "combined" + workerID string // this worker's ID + region string // this worker's region + httpAddr string // public HTTP address for direct access + execSessionManager *sandbox.ExecSessionManager // nil if not configured + sandboxDBs *sandbox.SandboxDBManager // per-sandbox SQLite manager + workos *auth.WorkOSMiddleware // nil if WorkOS not configured + workerRegistry *controlplane.RedisWorkerRegistry // nil in combined/worker mode + workerEvacuator workerEvacuator // nil when autoscaler is disabled + checkpointStore *storage.CheckpointStore // nil if hibernation not configured + sandboxDomain string // base domain for sandbox subdomains + cfClient *cloudflare.Client // nil if Cloudflare not configured + pendingCreates sync.Map // map[sandboxID]*pendingCreate — async sandbox creation tracking + sandboxAPIProxy *proxy.SandboxAPIProxy // nil except in server mode (proxies data-plane to workers) + stripeClient *billing.StripeClient // nil if Stripe not configured + redisClient *redis.Client // nil if Redis not configured (for health checks) + adminEvents *AdminEventBus // real-time event bus for admin dashboard + ready int32 // atomic: 1 = ready, 0 = not ready // Axiom log query (sandbox session logs read API). // Empty token = endpoint returns 503. 
@@ -94,34 +99,35 @@ func (s *Server) SetAxiomQueryConfig(queryToken, dataset string) { // pendingCreate tracks an async sandbox creation. type pendingCreate struct { ready chan struct{} // closed when creation completes - err error // set before closing ready + err error // set before closing ready } // ServerOpts holds optional dependencies for the API server. type ServerOpts struct { - Store *db.Store - JWTIssuer *auth.JWTIssuer - SessionJWTSecret string // shared edge↔CP HMAC secret; enables /internal/sandboxes/create - CFAdminSecret string // HMAC shared with CF CreditAccount DO; enables /admin/halt-org and /admin/resume-org - CFEventSecret string // HMAC shared with the api-edge Worker; enables /internal/secret-refresh and other edge-→cell push paths - CellID string // this control plane's cell_id - Mode string // "server", "worker", "combined" - WorkerID string - Region string - HTTPAddr string + Store *db.Store + JWTIssuer *auth.JWTIssuer + SessionJWTSecret string // shared edge↔CP HMAC secret; enables /internal/sandboxes/create + CFAdminSecret string // HMAC shared with CF CreditAccount DO; enables /admin/halt-org and /admin/resume-org + CFEventSecret string // HMAC shared with the api-edge Worker; enables /internal/secret-refresh and other edge-→cell push paths + CellID string // this control plane's cell_id + Mode string // "server", "worker", "combined" + WorkerID string + Region string + HTTPAddr string ExecSessionManager *sandbox.ExecSessionManager - SandboxDBs *sandbox.SandboxDBManager - Router *sandbox.SandboxRouter // nil in server-only mode - SandboxProxy *proxy.SandboxProxy // nil if subdomain routing not configured - ControlPlaneProxy *proxy.ControlPlaneProxy // nil except in server mode (routes subdomains to workers) - SandboxDomain string // base domain for sandbox subdomains - WorkOSConfig *auth.WorkOSConfig // nil if WorkOS not configured - WorkerRegistry *controlplane.RedisWorkerRegistry // nil in combined/worker mode - CheckpointStore 
*storage.CheckpointStore // nil if hibernation not configured - CFClient *cloudflare.Client // nil if Cloudflare not configured - SandboxAPIProxy *proxy.SandboxAPIProxy // nil except in server mode (proxies data-plane to workers) - StripeClient *billing.StripeClient // nil if Stripe not configured - RedisClient *redis.Client // nil if Redis not configured (for health checks) + SandboxDBs *sandbox.SandboxDBManager + Router *sandbox.SandboxRouter // nil in server-only mode + SandboxProxy *proxy.SandboxProxy // nil if subdomain routing not configured + ControlPlaneProxy *proxy.ControlPlaneProxy // nil except in server mode (routes subdomains to workers) + SandboxDomain string // base domain for sandbox subdomains + WorkOSConfig *auth.WorkOSConfig // nil if WorkOS not configured + WorkerRegistry *controlplane.RedisWorkerRegistry // nil in combined/worker mode + WorkerEvacuator *controlplane.Scaler // nil when autoscaler is disabled + CheckpointStore *storage.CheckpointStore // nil if hibernation not configured + CFClient *cloudflare.Client // nil if Cloudflare not configured + SandboxAPIProxy *proxy.SandboxAPIProxy // nil except in server mode (proxies data-plane to workers) + StripeClient *billing.StripeClient // nil if Stripe not configured + RedisClient *redis.Client // nil if Redis not configured (for health checks) } // NewServer creates a new API server with all routes configured. 
@@ -153,6 +159,7 @@ func NewServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, apiKey string, o s.sandboxDBs = opts.SandboxDBs s.router = opts.Router s.workerRegistry = opts.WorkerRegistry + s.workerEvacuator = opts.WorkerEvacuator s.checkpointStore = opts.CheckpointStore s.sandboxDomain = opts.SandboxDomain s.cfClient = opts.CFClient @@ -267,6 +274,7 @@ func NewServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, apiKey string, o admin.GET("/report", s.adminReport) admin.POST("/events/clear", s.adminClearEvents) admin.POST("/workers/:id/drain", s.adminSetWorkerDraining) + admin.POST("/workers/:id/evacuate", s.adminEvacuateWorker) admin.GET("/demo/migration", s.demoPingPongPage) admin.GET("/demo/chaos", s.demoChaosPage) diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index 29513177..c3389587 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -43,6 +43,8 @@ const ( evacuationBatchSize = 3 // sandboxes to migrate per eval cycle per worker evacuationCooldown = 60 * time.Second // per-worker cooldown between evacuation batches drainTimeout = 45 * time.Minute // max time to drain a worker via live migration (allows 30 sandboxes × 10min each in batches of 3) + drainBatchSuccessPause = 2 * time.Second // pause between successful drain batches + drainBatchFailurePause = 5 * time.Second // pause before retrying after a failed drain batch creationFailureThreshold = 3 // consecutive failures before exponential backoff creationBackoffMin = 1 * time.Minute // initial backoff after threshold hit @@ -243,6 +245,39 @@ func (s *Scaler) Stop() { s.wg.Wait() } +// EvacuateWorker starts the normal live-migration drain loop for a specific +// worker without terminating the machine when the worker becomes empty. This is +// intended for operator-triggered evacuation tests and spot-preemption drills. 
+func (s *Scaler) EvacuateWorker(_ context.Context, workerID string) error { + s.mu.Lock() + defer s.mu.Unlock() + + target := s.getWorkerInfo(workerID) + if target == nil { + return fmt.Errorf("worker %s not found", workerID) + } + if target.MachineID == "" { + return fmt.Errorf("worker %s has no machine id", workerID) + } + if !s.state.TryAcquireEvacuationLock() { + return fmt.Errorf("another evacuation is already running") + } + + s.state.SetDraining(target.MachineID, &drainState{ + WorkerID: target.ID, + MachineID: target.MachineID, + Region: target.Region, + StartedAt: time.Now(), + }) + + go func() { + defer s.state.ReleaseEvacuationLock() + s.drainWorker(target.ID, target.MachineID, target.Region) + }() + + return nil +} + func (s *Scaler) evaluate() { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -1239,9 +1274,9 @@ func (s *Scaler) drainWorker(workerID, machineID, region string) { } if batchFailed { - time.Sleep(5 * time.Second) + time.Sleep(drainBatchFailurePause) } else { - time.Sleep(2 * time.Second) + time.Sleep(drainBatchSuccessPause) } } } diff --git a/internal/controlplane/spot_evacuation_sim_test.go b/internal/controlplane/spot_evacuation_sim_test.go new file mode 100644 index 00000000..2edf435f --- /dev/null +++ b/internal/controlplane/spot_evacuation_sim_test.go @@ -0,0 +1,141 @@ +package controlplane + +import ( + "math" + "testing" + "time" +) + +type spotEvacuationScenario struct { + name string + sourceSlots int + sourceUsedPct int + spareSlots int + spareUsed int + migrationP95 time.Duration + noticeWindow time.Duration +} + +type spotEvacuationResult struct { + workloads int + spareAvailable int + moved int + remaining int + waves int + duration time.Duration +} + +func simulateSpotEvacuation(sc spotEvacuationScenario) spotEvacuationResult { + workloads := sc.sourceSlots * sc.sourceUsedPct / 100 + spareAvailable := sc.spareSlots - sc.spareUsed + if spareAvailable < 0 { + spareAvailable = 0 + } + + 
moved := workloads + if moved > spareAvailable { + moved = spareAvailable + } + remaining := workloads - moved + waves := int(math.Ceil(float64(moved) / float64(evacuationBatchSize))) + + var duration time.Duration + if waves > 0 { + duration = time.Duration(waves)*sc.migrationP95 + time.Duration(waves-1)*drainBatchSuccessPause + } + + return spotEvacuationResult{ + workloads: workloads, + spareAvailable: spareAvailable, + moved: moved, + remaining: remaining, + waves: waves, + duration: duration, + } +} + +func TestSpotEvacuationScenariosWithOneSpareWorker(t *testing.T) { + const ( + sourceSlots = 100 + spareSlots = 100 + ) + + cases := []struct { + sc spotEvacuationScenario + wantWithinNotice bool + }{ + { + sc: spotEvacuationScenario{ + name: "source 25 percent full", + sourceSlots: sourceSlots, + sourceUsedPct: 25, + spareSlots: spareSlots, + migrationP95: 5 * time.Second, + noticeWindow: 2 * time.Minute, + }, + wantWithinNotice: true, + }, + { + sc: spotEvacuationScenario{ + name: "source 50 percent full", + sourceSlots: sourceSlots, + sourceUsedPct: 50, + spareSlots: spareSlots, + migrationP95: 5 * time.Second, + noticeWindow: 2 * time.Minute, + }, + wantWithinNotice: true, + }, + { + sc: spotEvacuationScenario{ + name: "source 75 percent full", + sourceSlots: sourceSlots, + sourceUsedPct: 75, + spareSlots: spareSlots, + migrationP95: 5 * time.Second, + noticeWindow: 2 * time.Minute, + }, + wantWithinNotice: false, + }, + } + + for _, tc := range cases { + t.Run(tc.sc.name, func(t *testing.T) { + got := simulateSpotEvacuation(tc.sc) + + if got.remaining != 0 { + t.Fatalf("expected spare worker to fit all workloads, moved=%d remaining=%d", got.moved, got.remaining) + } + + withinNotice := got.duration <= tc.sc.noticeWindow + if withinNotice != tc.wantWithinNotice { + t.Fatalf("duration=%s waves=%d workloads=%d, within notice=%v want %v", + got.duration, got.waves, got.workloads, withinNotice, tc.wantWithinNotice) + } + + t.Logf("%s: workloads=%d waves=%d 
estimated=%s notice=%s", + tc.sc.name, got.workloads, got.waves, got.duration, tc.sc.noticeWindow) + }) + } +} + +func TestSpotEvacuationRequiresEnoughSpareCapacity(t *testing.T) { + sc := spotEvacuationScenario{ + name: "50 percent source, half-used spare", + sourceSlots: 100, + sourceUsedPct: 50, + spareSlots: 80, + spareUsed: 40, + migrationP95: 5 * time.Second, + noticeWindow: 2 * time.Minute, + } + + got := simulateSpotEvacuation(sc) + + if got.spareAvailable != 40 { + t.Fatalf("expected 40 spare slots, got %d", got.spareAvailable) + } + if got.moved != 40 || got.remaining != 10 { + t.Fatalf("expected partial evacuation: moved=40 remaining=10, got moved=%d remaining=%d", got.moved, got.remaining) + } +} From 5c80187990856b4a8896be35fe22ef2b716c46a6 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Tue, 2 Jun 2026 14:09:50 -0700 Subject: [PATCH 07/32] Revert "Improve worker migration readiness startup" This reverts commit d42fffea80b6f30528005e0097037c06242f993f. --- cmd/worker/main.go | 29 +--- deploy/packer/worker-ami-aws.pkr.hcl | 29 ---- internal/api/sandbox.go | 11 -- internal/controlplane/redis_registry.go | 79 ++++------- internal/controlplane/scaler.go | 3 - internal/controlplane/scaler_test.go | 65 +-------- .../controlplane/worker_readiness_test.go | 26 ---- internal/controlplane/worker_registry.go | 74 +++------- internal/qemu/manager.go | 129 +++++------------- internal/worker/redis_heartbeat.go | 122 ++++++----------- 10 files changed, 130 insertions(+), 437 deletions(-) delete mode 100644 internal/controlplane/worker_readiness_test.go diff --git a/cmd/worker/main.go b/cmd/worker/main.go index de2b44a3..1f4c0d04 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -267,8 +267,9 @@ func main() { // capacity. See internal/qemu/orphan_reaper.go. 
qmMgr.StartOrphanReaper(ctx) - if err := qmMgr.LoadGoldenVersionFromImage(); err != nil { - log.Printf("opensandbox-worker: WARNING: base golden version not available yet: %v", err) + // Prepare golden snapshot for fast VM creation + if err := qmMgr.PrepareGoldenSnapshot(); err != nil { + log.Printf("opensandbox-worker: WARNING: golden snapshot failed, using cold boot: %v", err) } mgr = qmMgr @@ -607,8 +608,6 @@ func main() { } }() - var hb *worker.RedisHeartbeat - // Redis heartbeat if cfg.RedisURL != "" { grpcAdvertise := grpcAddr @@ -616,16 +615,13 @@ func main() { grpcAdvertise = addr } - var err error - hb, err = worker.NewRedisHeartbeat(cfg.RedisURL, cfg.WorkerID, cfg.Region, grpcAdvertise, cfg.HTTPAddr) + hb, err := worker.NewRedisHeartbeat(cfg.RedisURL, cfg.WorkerID, cfg.Region, grpcAdvertise, cfg.HTTPAddr) if err != nil { log.Printf("opensandbox-worker: Redis heartbeat not available: %v", err) } else { hb.SetWorkerVersion(WorkerVersion) if qemuMgr != nil { hb.SetGoldenVersion(qemuMgr.GoldenVersion()) - hb.SetAcceptsCreates(false) - hb.SetAcceptsMigrations(qemuMgr.GoldenVersion() != "") } if envID := os.Getenv("OPENSANDBOX_MACHINE_ID"); envID != "" { hb.SetMachineID(envID) @@ -723,23 +719,6 @@ func main() { } } - if qemuMgr != nil { - go func() { - log.Println("opensandbox-worker: preparing golden snapshot in background") - if err := qemuMgr.PrepareGoldenSnapshot(); err != nil { - log.Printf("opensandbox-worker: WARNING: golden snapshot failed, using cold boot: %v", err) - } - if hb != nil { - hb.SetGoldenVersion(qemuMgr.GoldenVersion()) - hb.SetAcceptsCreates(true) - if qemuMgr.GoldenVersion() != "" { - hb.SetAcceptsMigrations(true) - } - } - log.Println("opensandbox-worker: create readiness enabled") - }() - } - // CF-parallel: Redis Streams event publisher. Inert unless CellID is set. // (The legacy NATS publisher used to run alongside this; it was removed // once Redis Streams covered all event types end-to-end. 
NATSURL in the diff --git a/deploy/packer/worker-ami-aws.pkr.hcl b/deploy/packer/worker-ami-aws.pkr.hcl index 04fee27a..9a7b4edf 100644 --- a/deploy/packer/worker-ami-aws.pkr.hcl +++ b/deploy/packer/worker-ami-aws.pkr.hcl @@ -20,15 +20,10 @@ # # 1. Build binaries for linux/amd64: # CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.WorkerVersion=$(git rev-parse --short HEAD)" \ # -o bin/opensandbox-worker ./cmd/worker/ -# CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X main.ServerVersion=$(git rev-parse --short HEAD)" \ -# -o bin/opensandbox-server ./cmd/server/ # CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/osb-agent ./cmd/agent/ # # # 2. Build the rootfs context tarball: # tar czf /tmp/packer-rootfs-ctx.tar.gz deploy/firecracker/rootfs/ deploy/ec2/build-rootfs-docker.sh scripts/claude-agent-wrapper/ -# npm --prefix web ci -# npm --prefix web run build -# tar czf /tmp/packer-web-dist.tar.gz web/dist/ # # # 3. Run packer: # packer init deploy/packer/worker-ami-aws.pkr.hcl @@ -76,11 +71,6 @@ variable "worker_binary" { default = "bin/opensandbox-worker" } -variable "server_binary" { - type = string - default = "bin/opensandbox-server" -} - variable "agent_binary" { type = string default = "bin/osb-agent" @@ -98,12 +88,6 @@ variable "vector_context" { description = "Pre-built tarball of deploy/vector/ (config + populator + units). Pre-create with: tar czf /tmp/packer-vector-ctx.tar.gz deploy/vector/" } -variable "web_dist_context" { - type = string - default = "/tmp/packer-web-dist.tar.gz" - description = "Pre-built tarball of web/dist/. 
Pre-create with: npm --prefix web ci && npm --prefix web run build && tar czf /tmp/packer-web-dist.tar.gz web/dist/" -} - variable "golden_cache_bucket" { type = string default = "" @@ -178,10 +162,6 @@ build { source = var.worker_binary destination = "/tmp/opensandbox-worker" } - provisioner "file" { - source = var.server_binary - destination = "/tmp/opensandbox-server" - } provisioner "file" { source = var.agent_binary destination = "/tmp/osb-agent" @@ -208,10 +188,6 @@ build { source = var.vector_context destination = "/tmp/vector-ctx.tar.gz" } - provisioner "file" { - source = var.web_dist_context - destination = "/tmp/web-dist.tar.gz" - } provisioner "shell" { inline = [ "mkdir -p /tmp/vector", @@ -244,13 +220,8 @@ build { # Install worker + agent binaries. "mv /tmp/opensandbox-worker /usr/local/bin/opensandbox-worker", "chmod +x /usr/local/bin/opensandbox-worker", - "mv /tmp/opensandbox-server /usr/local/bin/opensandbox-server", - "chmod +x /usr/local/bin/opensandbox-server", "mv /tmp/osb-agent /usr/local/bin/osb-agent", "chmod +x /usr/local/bin/osb-agent", - "mkdir -p /usr/local/bin/web", - "tar xzf /tmp/web-dist.tar.gz -C /usr/local/bin/web --strip-components=1", - "rm /tmp/web-dist.tar.gz", # Install systemd unit. 
"mv /tmp/opensandbox-worker.service /etc/systemd/system/opensandbox-worker.service", diff --git a/internal/api/sandbox.go b/internal/api/sandbox.go index 5b0d09ee..dbd4b400 100644 --- a/internal/api/sandbox.go +++ b/internal/api/sandbox.go @@ -1020,14 +1020,6 @@ func (s *Server) migrateSandbox(c echo.Context) error { return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandbox must be running to migrate"}) } - targetWorker := s.workerRegistry.GetWorker(req.TargetWorker) - if targetWorker == nil { - return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "target worker not found"}) - } - if !targetWorker.AcceptsMigrationRouting() { - return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "target worker is not accepting migrations"}) - } - // Mark as migrating — blocks exec/proxy routing until migration completes migrationDone := false if s.store != nil { @@ -1517,9 +1509,6 @@ func (s *Server) findScaleMigrationTargets(sourceWorkerID string, requestedMemMB if w.ID == sourceWorkerID { continue } - if !w.AcceptsMigrationRouting() { - continue - } if w.Draining { continue } diff --git a/internal/controlplane/redis_registry.go b/internal/controlplane/redis_registry.go index 2b72d187..745fd765 100644 --- a/internal/controlplane/redis_registry.go +++ b/internal/controlplane/redis_registry.go @@ -52,22 +52,20 @@ const ( // WorkerEntry represents a worker in the Redis-backed registry. 
type WorkerEntry struct { - ID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` + ID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` DiskPct float64 `json:"disk_pct"` TotalMemoryMB int `json:"total_memory_mb,omitempty"` CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` GoldenVersion string `json:"golden_version,omitempty"` WorkerVersion string `json:"worker_version,omitempty"` - AcceptsCreates bool `json:"accepts_creates,omitempty"` - AcceptsMigrations bool `json:"accepts_migrations,omitempty"` Draining bool `json:"draining,omitempty"` // Per-sandbox stats published by the worker. Bounded by per-host sandbox @@ -90,13 +88,13 @@ type SandboxStats struct { // backed by Redis pub/sub for real-time updates and periodic SCAN for reconciliation. // It also maintains a persistent gRPC connection pool to workers. 
type RedisWorkerRegistry struct { - rdb *redis.Client - mu sync.RWMutex - workers map[string]*WorkerEntry // in-memory hot cache - conns map[string]*grpc.ClientConn // persistent gRPC connections - clients map[string]pb.SandboxWorkerClient // cached gRPC clients - rrCounter uint64 // round-robin counter for tie-breaking - stop chan struct{} + rdb *redis.Client + mu sync.RWMutex + workers map[string]*WorkerEntry // in-memory hot cache + conns map[string]*grpc.ClientConn // persistent gRPC connections + clients map[string]pb.SandboxWorkerClient // cached gRPC clients + rrCounter uint64 // round-robin counter for tie-breaking + stop chan struct{} // onWorkerRejoined fires when a worker registers — both genuinely new and // after being pruned for missed heartbeats. Used by the reconcile-on- @@ -316,8 +314,6 @@ func (r *RedisWorkerRegistry) reconcileAndPrune() { // handleHeartbeat updates the in-memory worker map and dials gRPC if this is a new worker. func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { - normalizeWorkerEntryReadiness(&entry) - // Read drain state outside the lock — this is a network call and we don't // want to block other registry ops on Redis latency. The drain key is the // cross-CP source of truth; per-CP SetDraining writes it on the admin @@ -341,8 +337,6 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { existing.DiskPct = entry.DiskPct existing.TotalMemoryMB = entry.TotalMemoryMB existing.CommittedMemoryMB = entry.CommittedMemoryMB - existing.AcceptsCreates = entry.AcceptsCreates - existing.AcceptsMigrations = entry.AcceptsMigrations existing.Draining = drainOverride if entry.GoldenVersion != "" { existing.GoldenVersion = entry.GoldenVersion @@ -368,7 +362,7 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { // during the unreachable window. See internal/controlplane/reconcile.go. 
entry.Draining = drainOverride r.workers[entry.ID] = &entry - log.Printf("redis_registry: new worker registered: %s (region=%s, grpc=%s, draining=%v, acceptsCreates=%v, acceptsMigrations=%v)", entry.ID, entry.Region, entry.GRPCAddr, drainOverride, entry.AcceptsCreates, entry.AcceptsMigrations) + log.Printf("redis_registry: new worker registered: %s (region=%s, grpc=%s, draining=%v)", entry.ID, entry.Region, entry.GRPCAddr, drainOverride) if r.onWorkerRejoined != nil { // Fire in a goroutine — reconcile may take a few seconds (DB query // + a DestroySandbox RPC per stale entry) and we don't want @@ -445,16 +439,6 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { } } -func normalizeWorkerEntryReadiness(w *WorkerEntry) { - // Older workers did not publish these booleans. If both are absent they - // decode as false; preserve legacy behavior by treating them as ready for - // both placement classes. - if !w.AcceptsCreates && !w.AcceptsMigrations { - w.AcceptsCreates = true - w.AcceptsMigrations = true - } -} - // dialWorkerLocked dials a gRPC connection to a worker. Must be called with r.mu held. 
func (r *RedisWorkerRegistry) dialWorkerLocked(workerID, grpcAddr string) { creds, err := grpctls.ClientCredentials() @@ -598,9 +582,6 @@ func (r *RedisWorkerRegistry) collectEligibleLocked(region string, anyRegion boo if w.Draining { continue } - if !w.AcceptsCreateRouting() { - continue - } if w.CPUPct >= routingHardCapPct || w.MemPct >= routingHardCapPct || w.DiskPct >= routingHardCapPct { continue } @@ -770,22 +751,20 @@ func (r *RedisWorkerRegistry) GetWorkersByRegion(region string) []*WorkerInfo { for _, w := range r.workers { if w.Region == region { result = append(result, &WorkerInfo{ - ID: w.ID, - MachineID: w.MachineID, - Region: w.Region, - GRPCAddr: w.GRPCAddr, - HTTPAddr: w.HTTPAddr, - Capacity: w.Capacity, - Current: w.Current, - CPUPct: w.CPUPct, - MemPct: w.MemPct, - DiskPct: w.DiskPct, + ID: w.ID, + MachineID: w.MachineID, + Region: w.Region, + GRPCAddr: w.GRPCAddr, + HTTPAddr: w.HTTPAddr, + Capacity: w.Capacity, + Current: w.Current, + CPUPct: w.CPUPct, + MemPct: w.MemPct, + DiskPct: w.DiskPct, TotalMemoryMB: w.TotalMemoryMB, CommittedMemoryMB: w.CommittedMemoryMB, - GoldenVersion: w.GoldenVersion, - WorkerVersion: w.WorkerVersion, - AcceptsCreates: w.AcceptsCreates, - AcceptsMigrations: w.AcceptsMigrations, + GoldenVersion: w.GoldenVersion, + WorkerVersion: w.WorkerVersion, }) } } diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index c3389587..03d731ab 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -785,9 +785,6 @@ func (s *Scaler) findMigrationTarget(region, excludeWorkerID string, requiredMem if w.ID == excludeWorkerID { continue } - if !w.AcceptsMigrationRouting() { - continue - } if s.state.IsDraining(w.MachineID) { continue } diff --git a/internal/controlplane/scaler_test.go b/internal/controlplane/scaler_test.go index 5c3eff91..663bed17 100644 --- a/internal/controlplane/scaler_test.go +++ b/internal/controlplane/scaler_test.go @@ -173,12 +173,12 @@ func (p *mockPool) 
DestroyMachine(_ context.Context, machineID string) error { return nil } -func (p *mockPool) DrainMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) StartMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) StopMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) HealthCheck(_ context.Context, _ string) error { return nil } -func (p *mockPool) CleanupOrphanedResources(_ context.Context) (int, error) { return 0, nil } -func (p *mockPool) ListMachines(_ context.Context) ([]*compute.Machine, error) { return nil, nil } +func (p *mockPool) DrainMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) StartMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) StopMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) HealthCheck(_ context.Context, _ string) error { return nil } +func (p *mockPool) CleanupOrphanedResources(_ context.Context) (int, error) { return 0, nil } +func (p *mockPool) ListMachines(_ context.Context) ([]*compute.Machine, error) { return nil, nil } func (p *mockPool) SupportedRegions(_ context.Context) ([]string, error) { return []string{"us-east-1"}, nil } @@ -676,59 +676,6 @@ func TestFindMigrationTargetSelectsLeastLoaded(t *testing.T) { } } -func TestFindMigrationTargetSkipsMigrationDisabledWorker(t *testing.T) { - reg := newMockRegistry() - pool := newMockPool() - - reg.addWorker(&WorkerInfo{ - ID: "source", MachineID: "osb-worker-source", Region: "us-east-1", - Capacity: 50, Current: 20, CPUPct: 70, MemPct: 50, DiskPct: 30, - }) - reg.addWorker(&WorkerInfo{ - ID: "disabled", MachineID: "osb-worker-disabled", Region: "us-east-1", - Capacity: 50, Current: 1, CPUPct: 5, MemPct: 5, DiskPct: 5, - AcceptsCreates: true, AcceptsMigrations: false, - }) - reg.addWorker(&WorkerInfo{ - ID: "ready", MachineID: "osb-worker-ready", Region: "us-east-1", - Capacity: 50, Current: 10, CPUPct: 20, MemPct: 20, DiskPct: 20, 
- AcceptsCreates: true, AcceptsMigrations: true, - }) - - s := newTestScaler(reg, pool) - target := s.findMigrationTarget("us-east-1", "source", 0) - if target == nil { - t.Fatal("expected a migration target") - } - if target.ID != "ready" { - t.Errorf("expected ready worker as target, got %s", target.ID) - } -} - -func TestFindMigrationTargetAllowsCreateDisabledWorker(t *testing.T) { - reg := newMockRegistry() - pool := newMockPool() - - reg.addWorker(&WorkerInfo{ - ID: "source", MachineID: "osb-worker-source", Region: "us-east-1", - Capacity: 50, Current: 20, CPUPct: 70, MemPct: 50, DiskPct: 30, - }) - reg.addWorker(&WorkerInfo{ - ID: "target", MachineID: "osb-worker-target", Region: "us-east-1", - Capacity: 50, Current: 1, CPUPct: 5, MemPct: 5, DiskPct: 5, - AcceptsCreates: false, AcceptsMigrations: true, - }) - - s := newTestScaler(reg, pool) - target := s.findMigrationTarget("us-east-1", "source", 0) - if target == nil { - t.Fatal("expected a migration target") - } - if target.ID != "target" { - t.Errorf("expected target worker as target, got %s", target.ID) - } -} - func TestFindMigrationTargetSkipsPressuredWorkers(t *testing.T) { reg := newMockRegistry() pool := newMockPool() diff --git a/internal/controlplane/worker_readiness_test.go b/internal/controlplane/worker_readiness_test.go deleted file mode 100644 index 045bd3a6..00000000 --- a/internal/controlplane/worker_readiness_test.go +++ /dev/null @@ -1,26 +0,0 @@ -package controlplane - -import "testing" - -func TestWorkerEntryRoutingReadiness(t *testing.T) { - legacy := &WorkerEntry{} - if !legacy.AcceptsCreateRouting() || !legacy.AcceptsMigrationRouting() { - t.Fatal("legacy worker with no readiness fields should accept both routing classes") - } - - migrationOnly := &WorkerEntry{AcceptsCreates: false, AcceptsMigrations: true} - if migrationOnly.AcceptsCreateRouting() { - t.Fatal("migration-only worker should not accept create routing") - } - if !migrationOnly.AcceptsMigrationRouting() { - 
t.Fatal("migration-only worker should accept migration routing") - } - - createOnly := &WorkerEntry{AcceptsCreates: true, AcceptsMigrations: false} - if !createOnly.AcceptsCreateRouting() { - t.Fatal("create-only worker should accept create routing") - } - if createOnly.AcceptsMigrationRouting() { - t.Fatal("create-only worker should not accept migration routing") - } -} diff --git a/internal/controlplane/worker_registry.go b/internal/controlplane/worker_registry.go index 0e05b9e9..7dc9e77f 100644 --- a/internal/controlplane/worker_registry.go +++ b/internal/controlplane/worker_registry.go @@ -13,24 +13,22 @@ import ( // WorkerInfo represents a registered worker. type WorkerInfo struct { - ID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` - DiskPct float64 `json:"disk_pct"` - TotalMemoryMB int `json:"total_memory_mb,omitempty"` - CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` - GoldenVersion string `json:"golden_version,omitempty"` - WorkerVersion string `json:"worker_version,omitempty"` - AcceptsCreates bool `json:"accepts_creates,omitempty"` - AcceptsMigrations bool `json:"accepts_migrations,omitempty"` - LastSeen time.Time `json:"-"` - MissedBeats int `json:"-"` + ID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` + DiskPct float64 `json:"disk_pct"` + TotalMemoryMB int `json:"total_memory_mb,omitempty"` + CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` + GoldenVersion string 
`json:"golden_version,omitempty"` + WorkerVersion string `json:"worker_version,omitempty"` + LastSeen time.Time `json:"-"` + MissedBeats int `json:"-"` } // WorkerRegistry tracks live workers from NATS heartbeats. @@ -127,9 +125,6 @@ func (r *WorkerRegistry) GetLeastLoadedWorker(region string) *WorkerInfo { var best *WorkerInfo bestScore := -1.0 for _, w := range workers { - if !w.AcceptsCreateRouting() { - continue - } remaining := w.Capacity - w.Current if remaining <= 0 { continue @@ -231,7 +226,6 @@ func (r *WorkerRegistry) handleHeartbeat(msg *nats.Msg) { if err := json.Unmarshal(msg.Data, &hb); err != nil { return } - normalizeWorkerInfoReadiness(&hb) r.mu.Lock() defer r.mu.Unlock() @@ -243,8 +237,6 @@ func (r *WorkerRegistry) handleHeartbeat(msg *nats.Msg) { existing.CPUPct = hb.CPUPct existing.MemPct = hb.MemPct existing.DiskPct = hb.DiskPct - existing.AcceptsCreates = hb.AcceptsCreates - existing.AcceptsMigrations = hb.AcceptsMigrations if hb.GoldenVersion != "" { existing.GoldenVersion = hb.GoldenVersion } @@ -269,40 +261,6 @@ func (r *WorkerRegistry) handleHeartbeat(msg *nats.Msg) { } } -func normalizeWorkerInfoReadiness(w *WorkerInfo) { - // Older workers did not publish these booleans. If both are absent they - // decode as false; preserve legacy behavior by treating them as ready for - // both placement classes. - if !w.AcceptsCreates && !w.AcceptsMigrations { - w.AcceptsCreates = true - w.AcceptsMigrations = true - } -} - -// AcceptsCreateRouting returns whether this worker should receive new sandbox -// creates. Both fields false is treated as legacy/unknown and therefore ready. -func (w *WorkerInfo) AcceptsCreateRouting() bool { - return w.AcceptsCreates || (!w.AcceptsCreates && !w.AcceptsMigrations) -} - -// AcceptsMigrationRouting returns whether this worker should receive incoming -// live migrations. Both fields false is treated as legacy/unknown and ready. 
-func (w *WorkerInfo) AcceptsMigrationRouting() bool { - return w.AcceptsMigrations || (!w.AcceptsCreates && !w.AcceptsMigrations) -} - -// AcceptsCreateRouting returns whether this worker should receive new sandbox -// creates. Both fields false is treated as legacy/unknown and therefore ready. -func (w *WorkerEntry) AcceptsCreateRouting() bool { - return w.AcceptsCreates || (!w.AcceptsCreates && !w.AcceptsMigrations) -} - -// AcceptsMigrationRouting returns whether this worker should receive incoming -// live migrations. Both fields false is treated as legacy/unknown and ready. -func (w *WorkerEntry) AcceptsMigrationRouting() bool { - return w.AcceptsMigrations || (!w.AcceptsCreates && !w.AcceptsMigrations) -} - func (r *WorkerRegistry) checkStaleWorkers() { r.mu.Lock() defer r.mu.Unlock() diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index 9a117cbe..7c63aa4a 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -454,69 +454,9 @@ func (m *Manager) SetHibernationUploadCallback(cb func(sandboxID, hibernationKey // GoldenVersion returns the hash identifying this worker's golden snapshot base image. // Empty string means no golden snapshot is available. func (m *Manager) GoldenVersion() string { - m.mu.RLock() - defer m.mu.RUnlock() return m.goldenVersion } -// LoadGoldenVersionFromImage seeds GoldenVersion from the baked image metadata. -// It is intentionally lightweight so the worker can advertise its base version -// before the runtime golden memory snapshot has finished preparing. 
-func (m *Manager) LoadGoldenVersionFromImage() error { - baseImage, err := ResolveBaseImage(m.cfg.ImagesDir, "default") - if err != nil { - return fmt.Errorf("resolve base image: %w", err) - } - v, err := m.computeBaseGoldenVersion(baseImage) - if err != nil { - return err - } - m.mu.Lock() - m.goldenVersion = v - m.mu.Unlock() - log.Printf("qemu: loaded base golden version %s", v) - return nil -} - -func (m *Manager) computeBaseGoldenVersion(baseImage string) (string, error) { - versionPath := filepath.Join(m.cfg.ImagesDir, "golden-version") - if b, err := os.ReadFile(versionPath); err == nil { - if v := strings.TrimSpace(string(b)); v != "" { - return v, nil - } - } - return computeGoldenVersion(baseImage) -} - -func (m *Manager) setGoldenSnapshot(dir, version string, cid uint32, guestIP, hostIP string) { - m.mu.Lock() - defer m.mu.Unlock() - m.goldenDir = dir - m.goldenVersion = version - m.goldenCID = cid - m.goldenGuestIP = guestIP - m.goldenHostIP = hostIP -} - -func (m *Manager) goldenSnapshot() (dir, version string) { - m.mu.RLock() - defer m.mu.RUnlock() - return m.goldenDir, m.goldenVersion -} - -func (m *Manager) setGoldenDir(dir string) { - m.mu.Lock() - defer m.mu.Unlock() - m.goldenDir = dir -} - -func (m *Manager) restoreGoldenSnapshot(dir, version string) { - m.mu.Lock() - defer m.mu.Unlock() - m.goldenDir = dir - m.goldenVersion = version -} - // MemoryAllocatedBytes returns the sum of memory committed to currently-running // sandboxes, in bytes. Used by the worker's resource-stats tick to report // oversubscription independent of actual guest workload. 
@@ -788,34 +728,32 @@ func (m *Manager) PrepareGoldenSnapshot() error { stale := false baseImage, _ := ResolveBaseImage(m.cfg.ImagesDir, "default") if baseImage != "" && storedVersion != "" { - if currentHash, err := m.computeBaseGoldenVersion(baseImage); err == nil && currentHash != storedVersion { + if currentHash, err := computeGoldenVersion(baseImage); err == nil && currentHash != storedVersion { log.Printf("qemu: base image changed (golden=%s, disk=%s), rebuilding golden snapshot", storedVersion, currentHash) stale = true } } if !stale { - goldenVersion := storedVersion - var goldenCID uint32 - var goldenGuestIP, goldenHostIP string + m.goldenDir = goldenDir + m.goldenVersion = storedVersion if cidBytes, err := os.ReadFile(filepath.Join(goldenDir, "cid")); err == nil { - fmt.Sscanf(string(cidBytes), "%d", &goldenCID) + fmt.Sscanf(string(cidBytes), "%d", &m.goldenCID) } if ipBytes, err := os.ReadFile(filepath.Join(goldenDir, "guest_ip")); err == nil { - goldenGuestIP = string(ipBytes) + m.goldenGuestIP = string(ipBytes) } if ipBytes, err := os.ReadFile(filepath.Join(goldenDir, "host_ip")); err == nil { - goldenHostIP = string(ipBytes) + m.goldenHostIP = string(ipBytes) } if storedVersion == "" && baseImage != "" { - if v, err := m.computeBaseGoldenVersion(baseImage); err == nil { - goldenVersion = v + if v, err := computeGoldenVersion(baseImage); err == nil { + m.goldenVersion = v _ = os.WriteFile(versionFile, []byte(v), 0644) } } - m.setGoldenSnapshot(goldenDir, goldenVersion, goldenCID, goldenGuestIP, goldenHostIP) - log.Printf("qemu: golden snapshot already exists at %s (CID=%d, guestIP=%s, version=%s)", goldenDir, goldenCID, goldenGuestIP, goldenVersion) - go m.uploadBaseImageIfNew(goldenVersion) + log.Printf("qemu: golden snapshot already exists at %s (CID=%d, guestIP=%s, version=%s)", goldenDir, m.goldenCID, m.goldenGuestIP, m.goldenVersion) + go m.uploadBaseImageIfNew(m.goldenVersion) return nil } @@ -1029,22 +967,24 @@ func (m *Manager) 
PrepareGoldenSnapshot() error { } // Compute and persist golden version hash - goldenVersion := m.GoldenVersion() - if v, err := m.computeBaseGoldenVersion(baseImage); err == nil { - goldenVersion = v + if v, err := computeGoldenVersion(baseImage); err == nil { + m.goldenVersion = v _ = os.WriteFile(filepath.Join(goldenDir, "version"), []byte(v), 0644) } // Remove preparing marker — golden snapshot is complete os.Remove(preparingMarker) - m.setGoldenSnapshot(goldenDir, goldenVersion, goldenCID, netCfg.GuestIP, netCfg.HostIP) + m.goldenDir = goldenDir + m.goldenCID = goldenCID + m.goldenGuestIP = netCfg.GuestIP + m.goldenHostIP = netCfg.HostIP _ = os.WriteFile(filepath.Join(goldenDir, "cid"), []byte(fmt.Sprintf("%d", goldenCID)), 0644) _ = os.WriteFile(filepath.Join(goldenDir, "guest_ip"), []byte(netCfg.GuestIP), 0644) _ = os.WriteFile(filepath.Join(goldenDir, "host_ip"), []byte(netCfg.HostIP), 0644) log.Printf("qemu: golden snapshot ready (%dms total, mem=%s, CID=%d, guestIP=%s, version=%s)", - time.Since(t0).Milliseconds(), memFile, goldenCID, netCfg.GuestIP, goldenVersion) - go m.uploadBaseImageIfNew(goldenVersion) + time.Since(t0).Milliseconds(), memFile, goldenCID, netCfg.GuestIP, m.goldenVersion) + go m.uploadBaseImageIfNew(m.goldenVersion) return nil } @@ -1053,7 +993,7 @@ func (m *Manager) PrepareGoldenSnapshot() error { // independent reflink copies — only new sandboxes use the new golden. // Returns the old and new golden version strings. 
func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err error) { - oldGoldenDir, oldVersion := m.goldenSnapshot() + oldVersion = m.goldenVersion goldenDir := filepath.Join(m.cfg.DataDir, "golden") // Build new golden in a staging directory @@ -1061,13 +1001,14 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er os.RemoveAll(stagingDir) // clean up any prior failed attempt // Temporarily point goldenDir to staging so PrepareGoldenSnapshot builds there - m.setGoldenDir("") + oldGoldenDir := m.goldenDir + m.goldenDir = "" // Rename current golden out of the way so PrepareGoldenSnapshot sees no existing snapshot backupDir := filepath.Join(m.cfg.DataDir, "golden-old") os.RemoveAll(backupDir) if err := os.Rename(goldenDir, backupDir); err != nil && !os.IsNotExist(err) { - m.setGoldenDir(oldGoldenDir) + m.goldenDir = oldGoldenDir return oldVersion, "", fmt.Errorf("backup old golden: %w", err) } @@ -1076,12 +1017,13 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er // Restore old golden on failure os.RemoveAll(goldenDir) if backupErr := os.Rename(backupDir, goldenDir); backupErr == nil { - m.restoreGoldenSnapshot(oldGoldenDir, oldVersion) + m.goldenDir = oldGoldenDir + m.goldenVersion = oldVersion } return oldVersion, "", fmt.Errorf("rebuild golden: %w", err) } - newVersion = m.GoldenVersion() + newVersion = m.goldenVersion // Clean up old golden — sandboxes created from it have independent reflink copies os.RemoveAll(backupDir) @@ -1095,10 +1037,6 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er // After restore, we patch the network config inside the guest. 
func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, id string) (*types.Sandbox, error) { t0 := time.Now() - goldenDir, goldenVersion := m.goldenSnapshot() - if goldenDir == "" { - return nil, fmt.Errorf("golden snapshot not ready") - } template := cfg.Template if template == "" || template == "base" { @@ -1112,7 +1050,7 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, // Copy golden rootfs as qcow2 overlay (golden snapshot was taken with qcow2 drives) rootfsPath := filepath.Join(sandboxDir, "rootfs.qcow2") - goldenRootfs := filepath.Join(goldenDir, "rootfs.qcow2") + goldenRootfs := filepath.Join(m.goldenDir, "rootfs.qcow2") if err := copyFileReflink(goldenRootfs, rootfsPath); err != nil { os.RemoveAll(sandboxDir) return nil, fmt.Errorf("copy golden rootfs: %w", err) @@ -1124,7 +1062,7 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, workspacePath := filepath.Join(sandboxDir, "workspace.qcow2") diskMB := m.cfg.DefaultDiskMB var goldenWSUUID string - if data, readErr := os.ReadFile(filepath.Join(goldenDir, "workspace_uuid")); readErr == nil { + if data, readErr := os.ReadFile(filepath.Join(m.goldenDir, "workspace_uuid")); readErr == nil { goldenWSUUID = strings.TrimSpace(string(data)) } if err := CreateWorkspace(workspacePath, diskMB, goldenWSUUID); err != nil { @@ -1221,8 +1159,8 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, // Build QEMU args with -incoming to restore from golden snapshot. // Use zstd-compressed mem file if available (less EBS I/O despite CPU cost). 
- goldenMemZst := filepath.Join(goldenDir, "mem.zst") - goldenMemRaw := filepath.Join(goldenDir, "mem") + goldenMemZst := filepath.Join(m.goldenDir, "mem.zst") + goldenMemRaw := filepath.Join(m.goldenDir, "mem") var incomingURI string if fileExists(goldenMemZst) { incomingURI = fmt.Sprintf("exec:zstdcat %s", goldenMemZst) @@ -1302,7 +1240,7 @@ func (m *Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, guestMAC: guestMAC, guestCID: guestCID, bootArgs: bootArgs, - goldenVersion: goldenVersion, + goldenVersion: m.goldenVersion, } // Connect to agent via Unix socket @@ -1663,8 +1601,7 @@ func (m *Manager) Create(ctx context.Context, cfg types.SandboxConfig) (sb *type } // Fast path: restore from golden snapshot if available and using default template - goldenDir, _ := m.goldenSnapshot() - if goldenDir != "" && template == "default" && cfg.TemplateRootfsKey == "" { + if m.goldenDir != "" && template == "default" && cfg.TemplateRootfsKey == "" { sb, err := m.createFromGolden(ctx, cfg, id) if err != nil { log.Printf("qemu: golden restore failed for %s, falling back to cold boot: %v", id, err) @@ -1837,7 +1774,7 @@ func (m *Manager) Create(ctx context.Context, cfg types.SandboxConfig) (sb *type guestMAC: guestMAC, guestCID: guestCID, bootArgs: bootArgs, - goldenVersion: m.GoldenVersion(), // set even on cold boot — VM uses the same base image + goldenVersion: m.goldenVersion, // set even on cold boot — VM uses the same base image } // Wait for agent via Unix socket @@ -3800,7 +3737,7 @@ func (m *Manager) ForkFromCheckpoint(ctx context.Context, checkpointID string, c guestCID: guestCID, bootArgs: bootArgs, agent: agent, - goldenVersion: m.GoldenVersion(), // set on wake — VM uses the current base image + goldenVersion: m.goldenVersion, // set on wake — VM uses the current base image } m.mu.Lock() diff --git a/internal/worker/redis_heartbeat.go b/internal/worker/redis_heartbeat.go index e48b4de2..82c22d60 100644 --- a/internal/worker/redis_heartbeat.go 
+++ b/internal/worker/redis_heartbeat.go @@ -13,22 +13,20 @@ import ( // redisHeartbeatPayload is the JSON structure published to Redis. type redisHeartbeatPayload struct { - WorkerID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID (e.g. i-099088f8ac4a34ef3) - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` + WorkerID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID (e.g. i-099088f8ac4a34ef3) + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` DiskPct float64 `json:"disk_pct"` TotalMemoryMB int `json:"total_memory_mb,omitempty"` CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` GoldenVersion string `json:"golden_version,omitempty"` WorkerVersion string `json:"worker_version,omitempty"` - AcceptsCreates bool `json:"accepts_creates,omitempty"` - AcceptsMigrations bool `json:"accepts_migrations,omitempty"` // Per-sandbox stats snapshot. Populated by the worker's stats collector // (see internal/qemu/stats_collector.go) and consumed by the CP autoscaler @@ -54,24 +52,21 @@ type SandboxStatsWire struct { // 1. SETs worker:{id} with a 30s TTL (auto-expires if worker dies) // 2. 
PUBLISHes to workers:heartbeat for real-time server notification type RedisHeartbeat struct { - rdb *redis.Client - workerID string - machineID string - region string - grpcAddr string - httpAddr string - getStats func() (capacity, current int, cpuPct, memPct, diskPct float64) - getMemoryInfo func() (totalMB, committedMB int) // optional: committed memory for dynamic capacity - getSandboxStats func() map[string]SandboxStatsWire // optional: per-sandbox stats for autoscaler - onReconnect func() // called when heartbeat succeeds after a previous failure - stateMu sync.RWMutex - goldenVersion string - workerVersion string - acceptsCreates bool - acceptsMigrations bool - wasDown bool // true if the last publish failed (used to detect reconnect) - stop chan struct{} - stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer + rdb *redis.Client + workerID string + machineID string + region string + grpcAddr string + httpAddr string + getStats func() (capacity, current int, cpuPct, memPct, diskPct float64) + getMemoryInfo func() (totalMB, committedMB int) // optional: committed memory for dynamic capacity + getSandboxStats func() map[string]SandboxStatsWire // optional: per-sandbox stats for autoscaler + onReconnect func() // called when heartbeat succeeds after a previous failure + goldenVersion string + workerVersion string + wasDown bool // true if the last publish failed (used to detect reconnect) + stop chan struct{} + stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer } // NewRedisHeartbeat creates a new heartbeat publisher. 
@@ -97,54 +92,30 @@ func NewRedisHeartbeat(redisURL, workerID, region, grpcAddr, httpAddr string) (* } return &RedisHeartbeat{ - rdb: rdb, - workerID: workerID, - region: region, - grpcAddr: grpcAddr, - httpAddr: httpAddr, - acceptsCreates: true, - acceptsMigrations: true, - stop: make(chan struct{}), + rdb: rdb, + workerID: workerID, + region: region, + grpcAddr: grpcAddr, + httpAddr: httpAddr, + stop: make(chan struct{}), }, nil } // SetMachineID sets the EC2 instance ID for the heartbeat (used by scaler for drain/terminate). func (h *RedisHeartbeat) SetMachineID(id string) { - h.stateMu.Lock() - defer h.stateMu.Unlock() h.machineID = id } // SetGoldenVersion sets the golden snapshot version hash for the heartbeat. func (h *RedisHeartbeat) SetGoldenVersion(v string) { - h.stateMu.Lock() - defer h.stateMu.Unlock() h.goldenVersion = v } // SetWorkerVersion sets the worker binary version (git SHA) for the heartbeat. func (h *RedisHeartbeat) SetWorkerVersion(v string) { - h.stateMu.Lock() - defer h.stateMu.Unlock() h.workerVersion = v } -// SetAcceptsCreates controls whether the control plane may route new sandbox -// creates to this worker. -func (h *RedisHeartbeat) SetAcceptsCreates(v bool) { - h.stateMu.Lock() - defer h.stateMu.Unlock() - h.acceptsCreates = v -} - -// SetAcceptsMigrations controls whether the control plane may choose this -// worker as an incoming live-migration target. -func (h *RedisHeartbeat) SetAcceptsMigrations(v bool) { - h.stateMu.Lock() - defer h.stateMu.Unlock() - h.acceptsMigrations = v -} - // SetMemoryInfoFunc sets a callback that returns host total and committed memory in MB. // Used for dynamic capacity reporting. 
func (h *RedisHeartbeat) SetMemoryInfoFunc(fn func() (totalMB, committedMB int)) { @@ -188,29 +159,20 @@ func (h *RedisHeartbeat) Start(getStats func() (int, int, float64, float64, floa func (h *RedisHeartbeat) publish() { capacity, current, cpuPct, memPct, diskPct := h.getStats() - h.stateMu.RLock() - machineID := h.machineID - goldenVersion := h.goldenVersion - workerVersion := h.workerVersion - acceptsCreates := h.acceptsCreates - acceptsMigrations := h.acceptsMigrations - h.stateMu.RUnlock() payload := redisHeartbeatPayload{ - WorkerID: h.workerID, - MachineID: machineID, - Region: h.region, - GRPCAddr: h.grpcAddr, - HTTPAddr: h.httpAddr, - Capacity: capacity, - Current: current, - CPUPct: cpuPct, - MemPct: memPct, - DiskPct: diskPct, - GoldenVersion: goldenVersion, - WorkerVersion: workerVersion, - AcceptsCreates: acceptsCreates, - AcceptsMigrations: acceptsMigrations, + WorkerID: h.workerID, + MachineID: h.machineID, + Region: h.region, + GRPCAddr: h.grpcAddr, + HTTPAddr: h.httpAddr, + Capacity: capacity, + Current: current, + CPUPct: cpuPct, + MemPct: memPct, + DiskPct: diskPct, + GoldenVersion: h.goldenVersion, + WorkerVersion: h.workerVersion, } // Add committed memory info for dynamic capacity From 35c0e94b45a590844878a303f8944f191debfe42 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Tue, 2 Jun 2026 14:14:38 -0700 Subject: [PATCH 08/32] Fix scaler tests after readiness revert --- internal/controlplane/scaler_test.go | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/internal/controlplane/scaler_test.go b/internal/controlplane/scaler_test.go index 663bed17..86c39ba2 100644 --- a/internal/controlplane/scaler_test.go +++ b/internal/controlplane/scaler_test.go @@ -173,12 +173,12 @@ func (p *mockPool) DestroyMachine(_ context.Context, machineID string) error { return nil } -func (p *mockPool) DrainMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) StartMachine(_ context.Context, _ string) 
error { return nil } -func (p *mockPool) StopMachine(_ context.Context, _ string) error { return nil } -func (p *mockPool) HealthCheck(_ context.Context, _ string) error { return nil } -func (p *mockPool) CleanupOrphanedResources(_ context.Context) (int, error) { return 0, nil } -func (p *mockPool) ListMachines(_ context.Context) ([]*compute.Machine, error) { return nil, nil } +func (p *mockPool) DrainMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) StartMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) StopMachine(_ context.Context, _ string) error { return nil } +func (p *mockPool) HealthCheck(_ context.Context, _ string) error { return nil } +func (p *mockPool) CleanupOrphanedResources(_ context.Context) (int, error) { return 0, nil } +func (p *mockPool) ListMachines(_ context.Context) ([]*compute.Machine, error) { return nil, nil } func (p *mockPool) SupportedRegions(_ context.Context) ([]string, error) { return []string{"us-east-1"}, nil } @@ -414,14 +414,17 @@ func TestSmartScaleDownTargetsLeastLoaded(t *testing.T) { reg.addWorker(&WorkerInfo{ ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", Capacity: 50, Current: 5, CPUPct: 20, MemPct: 20, DiskPct: 20, + TotalMemoryMB: 64000, CommittedMemoryMB: 12800, }) reg.addWorker(&WorkerInfo{ ID: "w2", MachineID: "osb-worker-w2", Region: "us-east-1", Capacity: 50, Current: 2, CPUPct: 10, MemPct: 10, DiskPct: 10, + TotalMemoryMB: 64000, CommittedMemoryMB: 6400, }) reg.addWorker(&WorkerInfo{ ID: "w3", MachineID: "osb-worker-w3", Region: "us-east-1", Capacity: 50, Current: 5, CPUPct: 20, MemPct: 20, DiskPct: 20, + TotalMemoryMB: 64000, CommittedMemoryMB: 12800, }) s := newTestScaler(reg, pool) @@ -463,10 +466,17 @@ func TestScaleDownSkipsAlreadyDraining(t *testing.T) { reg.addWorker(&WorkerInfo{ ID: "w2", MachineID: "osb-worker-w2", Region: "us-east-1", Capacity: 50, Current: 2, CPUPct: 10, MemPct: 10, DiskPct: 10, + TotalMemoryMB: 64000, CommittedMemoryMB: 
6400, }) reg.addWorker(&WorkerInfo{ ID: "w3", MachineID: "osb-worker-w3", Region: "us-east-1", Capacity: 50, Current: 5, CPUPct: 20, MemPct: 20, DiskPct: 20, + TotalMemoryMB: 64000, CommittedMemoryMB: 12800, + }) + reg.addWorker(&WorkerInfo{ + ID: "w4", MachineID: "osb-worker-w4", Region: "us-east-1", + Capacity: 50, Current: 10, CPUPct: 20, MemPct: 20, DiskPct: 20, + TotalMemoryMB: 64000, CommittedMemoryMB: 12800, }) s := newTestScaler(reg, pool) @@ -857,7 +867,7 @@ func TestDrainTimeoutCancelsDrainKeepsWorker(t *testing.T) { WorkerID: "w2", MachineID: "osb-worker-w2", Region: "us-east-1", - StartedAt: time.Now().Add(-20 * time.Minute), // well past drainTimeout + StartedAt: time.Now().Add(-(drainTimeout + time.Minute)), }) ctx := context.Background() From 6787fe7c25ab6c62a7931c4fd8945f7ae8724d09 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Tue, 2 Jun 2026 20:34:22 -0700 Subject: [PATCH 09/32] fix spot drain retry behavior --- cmd/agent/listen_linux.go | 12 +---- internal/controlplane/scaler.go | 65 +++++++++++++++++++++++----- internal/controlplane/scaler_test.go | 21 +++++++++ 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/cmd/agent/listen_linux.go b/cmd/agent/listen_linux.go index 06f52a8e..33a4f917 100644 --- a/cmd/agent/listen_linux.go +++ b/cmd/agent/listen_linux.go @@ -111,15 +111,6 @@ func (l *virtioSerialListener) Accept() (net.Conn, error) { } l.mu.Unlock() - // Drain any stale data from the previous gRPC session. - // After a gRPC connection drops, the serial port may have leftover - // bytes (GOAWAY frames, partial HTTP/2 frames). If we hand these to - // the new gRPC session, it gets a protocol error and closes immediately. 
- if instrumentVirtioSerial { - log.Printf("virtio-serial: Accept iter: active=false, draining stale...") - } - drainStaleData(l.f) - // Wait for the host to connect (port becomes readable with fresh data) if !waitForReadable(l.f, 500*time.Millisecond) { continue @@ -219,8 +210,7 @@ type virtioSerialConn struct { // instrumentVirtioSerial controls per-Read logging on the conn. Set true via // OSB_AGENT_TRACE_VIRTIO=1 env var to debug post-loadvm protocol confusion. -// HARDCODED TO TRUE FOR INVESTIGATION BUILD — revert before merging. -var instrumentVirtioSerial = true || os.Getenv("OSB_AGENT_TRACE_VIRTIO") == "1" +var instrumentVirtioSerial = os.Getenv("OSB_AGENT_TRACE_VIRTIO") == "1" func (c *virtioSerialConn) Read(b []byte) (int, error) { n, err := c.f.Read(b) diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index 03d731ab..d80a3113 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -45,6 +45,9 @@ const ( drainTimeout = 45 * time.Minute // max time to drain a worker via live migration (allows 30 sandboxes × 10min each in batches of 3) drainBatchSuccessPause = 2 * time.Second // pause between successful drain batches drainBatchFailurePause = 5 * time.Second // pause before retrying after a failed drain batch + spotDrainFailurePause = 1 * time.Second // spot/preemption drains should keep moving inside the 2min notice window + normalDrainBackoff = 5 * time.Minute // conservative retry backoff for ordinary scaler drains + spotDrainBackoff = 1 * time.Second // aggressive retry backoff for spot/preemption drains creationFailureThreshold = 3 // consecutive failures before exponential backoff creationBackoffMin = 1 * time.Minute // initial backoff after threshold hit @@ -114,6 +117,38 @@ type drainState struct { MachineID string `json:"machine_id"` Region string `json:"region"` StartedAt time.Time `json:"started_at"` + Reason string `json:"reason,omitempty"` +} + +const ( + drainReasonScaleDown = 
"scale_down" + drainReasonRollingReplace = "rolling_replace" + drainReasonSpotPreemption = "spot_preemption" + maxDrainMigrationFailures = 3 +) + +type drainPolicy struct { + reason string + backoff time.Duration + failurePause time.Duration +} + +func drainPolicyForReason(reason string) drainPolicy { + if reason == drainReasonSpotPreemption { + return drainPolicy{ + reason: reason, + backoff: spotDrainBackoff, + failurePause: spotDrainFailurePause, + } + } + if reason == "" { + reason = drainReasonScaleDown + } + return drainPolicy{ + reason: reason, + backoff: normalDrainBackoff, + failurePause: drainBatchFailurePause, + } } // Scaler manages autoscaling of workers via the compute Pool. @@ -268,11 +303,12 @@ func (s *Scaler) EvacuateWorker(_ context.Context, workerID string) error { MachineID: target.MachineID, Region: target.Region, StartedAt: time.Now(), + Reason: drainReasonSpotPreemption, }) go func() { defer s.state.ReleaseEvacuationLock() - s.drainWorker(target.ID, target.MachineID, target.Region) + s.drainWorker(target.ID, target.MachineID, target.Region, drainReasonSpotPreemption) }() return nil @@ -957,9 +993,10 @@ func (s *Scaler) smartScaleDown(_ context.Context, region string, workers []*Wor MachineID: target.MachineID, Region: region, StartedAt: time.Now(), + Reason: drainReasonScaleDown, }) - go s.drainWorker(target.ID, target.MachineID, region) + go s.drainWorker(target.ID, target.MachineID, region, drainReasonScaleDown) } // rollingReplace executes a quota-aware rolling replacement of stale workers @@ -1069,6 +1106,7 @@ func (s *Scaler) rollingReplace(ctx context.Context, region string, workers []*W MachineID: target.MachineID, Region: region, StartedAt: time.Now(), + Reason: drainReasonRollingReplace, }) // Run the dance in a goroutine so we don't block the scaler tick. @@ -1093,7 +1131,7 @@ func (s *Scaler) replaceOneStale(ctx context.Context, region string, target *Wor // fires. 
drainWorker handles per-sandbox findMigrationTarget so each // sandbox lands on whichever current-version worker has the most room // at that exact moment. - s.drainWorker(target.ID, target.MachineID, region) + s.drainWorker(target.ID, target.MachineID, region, drainReasonRollingReplace) // 2. Terminate the (now-empty) stale worker. Frees the quota slot for the // replacement scaleUp below. We do this even on partial drain — the @@ -1126,9 +1164,10 @@ func (s *Scaler) replaceOneStale(ctx context.Context, region string, target *Wor // (e.g., S3 auth, no targets), falls back to waiting for natural expiry — // sandboxes will timeout or be destroyed by users. No new sandboxes are routed // to draining workers. -func (s *Scaler) drainWorker(workerID, machineID, region string) { +func (s *Scaler) drainWorker(workerID, machineID, region, reason string) { ctx, cancel := context.WithTimeout(context.Background(), drainTimeout) defer cancel() + policy := drainPolicyForReason(reason) // Mark worker as draining so routing skips it s.registry.SetDraining(workerID, true) @@ -1141,7 +1180,6 @@ func (s *Scaler) drainWorker(workerID, machineID, region string) { } migrationFailures := 0 - const maxMigrationFailures = 3 // after 3 failed attempts, stop trying migration for { select { @@ -1197,14 +1235,14 @@ func (s *Scaler) drainWorker(workerID, machineID, region string) { // and a periodic reset lets us succeed once the transient clears. If // migration genuinely can't complete, drainTimeout terminates the loop // and the next eval tick takes over. 
- if migrationFailures >= maxMigrationFailures { - log.Printf("scaler: drain: %d migration failures on %s, backing off 5min before retry (%d sandboxes remaining)", - migrationFailures, workerID, len(running)) + if migrationFailures >= maxDrainMigrationFailures { + log.Printf("scaler: drain: %d migration failures on %s, backing off %s before retry (reason=%s, %d sandboxes remaining)", + migrationFailures, workerID, policy.backoff, policy.reason, len(running)) select { case <-ctx.Done(): log.Printf("scaler: drain: timeout reached for worker %s", workerID) return - case <-time.After(5 * time.Minute): + case <-time.After(policy.backoff): } migrationFailures = 0 continue @@ -1228,7 +1266,7 @@ func (s *Scaler) drainWorker(workerID, machineID, region string) { } } pending := len(s.state.GetPendingLaunches(region)) - if effective+pending < s.maxWorkers { + if s.pool != nil && effective+pending < s.maxWorkers { log.Printf("scaler: drain: no migration target for %s (%d sandboxes), triggering scale-up", workerID, len(running)) s.scaleUp(ctx, region) select { @@ -1238,6 +1276,11 @@ func (s *Scaler) drainWorker(workerID, machineID, region string) { } continue } + if s.pool == nil { + log.Printf("scaler: drain: no migration target for %s (%d sandboxes) and no compute pool configured, waiting for natural expiry", workerID, len(running)) + s.waitForNaturalDrain(ctx, workerID) + return + } log.Printf("scaler: drain: no migration target for %s (%d sandboxes) and at max workers (%d), waiting for natural expiry", workerID, len(running), s.maxWorkers) s.waitForNaturalDrain(ctx, workerID) return @@ -1271,7 +1314,7 @@ func (s *Scaler) drainWorker(workerID, machineID, region string) { } if batchFailed { - time.Sleep(drainBatchFailurePause) + time.Sleep(policy.failurePause) } else { time.Sleep(drainBatchSuccessPause) } diff --git a/internal/controlplane/scaler_test.go b/internal/controlplane/scaler_test.go index 86c39ba2..a04aa792 100644 --- a/internal/controlplane/scaler_test.go +++ 
b/internal/controlplane/scaler_test.go @@ -1788,6 +1788,27 @@ func TestInMemoryScalerState_Draining(t *testing.T) { } } +func TestDrainPolicyForReason(t *testing.T) { + normal := drainPolicyForReason(drainReasonScaleDown) + if normal.backoff != normalDrainBackoff { + t.Fatalf("expected normal drain backoff %s, got %s", normalDrainBackoff, normal.backoff) + } + if normal.failurePause != drainBatchFailurePause { + t.Fatalf("expected normal drain failure pause %s, got %s", drainBatchFailurePause, normal.failurePause) + } + + spot := drainPolicyForReason(drainReasonSpotPreemption) + if spot.backoff != spotDrainBackoff { + t.Fatalf("expected spot drain backoff %s, got %s", spotDrainBackoff, spot.backoff) + } + if spot.failurePause != spotDrainFailurePause { + t.Fatalf("expected spot drain failure pause %s, got %s", spotDrainFailurePause, spot.failurePause) + } + if spot.backoff >= normal.backoff { + t.Fatalf("expected spot drain backoff to be shorter than normal drain backoff") + } +} + func TestInMemoryScalerState_AllDraining(t *testing.T) { m := NewInMemoryScalerState() From 57ac20db38a7fc8600918696ac35970736a92ffd Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Tue, 2 Jun 2026 22:09:30 -0700 Subject: [PATCH 10/32] Improve spot drain migration handling --- internal/controlplane/redis_registry.go | 57 +++--- internal/controlplane/scaler.go | 83 +++++--- internal/controlplane/scaler_test.go | 36 +++- internal/controlplane/worker_registry.go | 33 ++-- internal/db/migration_pgfixture_test.go | 83 ++++++++ internal/db/store.go | 75 ++++---- scripts/test-spot-drain-health.sh | 232 +++++++++++++++++++++++ 7 files changed, 496 insertions(+), 103 deletions(-) create mode 100644 internal/db/migration_pgfixture_test.go create mode 100755 scripts/test-spot-drain-health.sh diff --git a/internal/controlplane/redis_registry.go b/internal/controlplane/redis_registry.go index 745fd765..30da6b4b 100644 --- a/internal/controlplane/redis_registry.go +++ 
b/internal/controlplane/redis_registry.go @@ -52,15 +52,15 @@ const ( // WorkerEntry represents a worker in the Redis-backed registry. type WorkerEntry struct { - ID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` + ID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` DiskPct float64 `json:"disk_pct"` TotalMemoryMB int `json:"total_memory_mb,omitempty"` CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` @@ -88,13 +88,13 @@ type SandboxStats struct { // backed by Redis pub/sub for real-time updates and periodic SCAN for reconciliation. // It also maintains a persistent gRPC connection pool to workers. type RedisWorkerRegistry struct { - rdb *redis.Client - mu sync.RWMutex - workers map[string]*WorkerEntry // in-memory hot cache - conns map[string]*grpc.ClientConn // persistent gRPC connections - clients map[string]pb.SandboxWorkerClient // cached gRPC clients - rrCounter uint64 // round-robin counter for tie-breaking - stop chan struct{} + rdb *redis.Client + mu sync.RWMutex + workers map[string]*WorkerEntry // in-memory hot cache + conns map[string]*grpc.ClientConn // persistent gRPC connections + clients map[string]pb.SandboxWorkerClient // cached gRPC clients + rrCounter uint64 // round-robin counter for tie-breaking + stop chan struct{} // onWorkerRejoined fires when a worker registers — both genuinely new and // after being pruned for missed heartbeats. 
Used by the reconcile-on- @@ -751,20 +751,21 @@ func (r *RedisWorkerRegistry) GetWorkersByRegion(region string) []*WorkerInfo { for _, w := range r.workers { if w.Region == region { result = append(result, &WorkerInfo{ - ID: w.ID, - MachineID: w.MachineID, - Region: w.Region, - GRPCAddr: w.GRPCAddr, - HTTPAddr: w.HTTPAddr, - Capacity: w.Capacity, - Current: w.Current, - CPUPct: w.CPUPct, - MemPct: w.MemPct, - DiskPct: w.DiskPct, + ID: w.ID, + MachineID: w.MachineID, + Region: w.Region, + GRPCAddr: w.GRPCAddr, + HTTPAddr: w.HTTPAddr, + Capacity: w.Capacity, + Current: w.Current, + CPUPct: w.CPUPct, + MemPct: w.MemPct, + DiskPct: w.DiskPct, + Draining: w.Draining, TotalMemoryMB: w.TotalMemoryMB, CommittedMemoryMB: w.CommittedMemoryMB, - GoldenVersion: w.GoldenVersion, - WorkerVersion: w.WorkerVersion, + GoldenVersion: w.GoldenVersion, + WorkerVersion: w.WorkerVersion, }) } } diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index d80a3113..8c150fd6 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -48,6 +48,7 @@ const ( spotDrainFailurePause = 1 * time.Second // spot/preemption drains should keep moving inside the 2min notice window normalDrainBackoff = 5 * time.Minute // conservative retry backoff for ordinary scaler drains spotDrainBackoff = 1 * time.Second // aggressive retry backoff for spot/preemption drains + spotDrainConcurrency = 10 // rolling parallelism for urgent spot/preemption drains creationFailureThreshold = 3 // consecutive failures before exponential backoff creationBackoffMin = 1 * time.Minute // initial backoff after threshold hit @@ -131,6 +132,7 @@ type drainPolicy struct { reason string backoff time.Duration failurePause time.Duration + concurrency int } func drainPolicyForReason(reason string) drainPolicy { @@ -139,6 +141,7 @@ func drainPolicyForReason(reason string) drainPolicy { reason: reason, backoff: spotDrainBackoff, failurePause: spotDrainFailurePause, + concurrency: 
spotDrainConcurrency, } } if reason == "" { @@ -148,6 +151,7 @@ func drainPolicyForReason(reason string) drainPolicy { reason: reason, backoff: normalDrainBackoff, failurePause: drainBatchFailurePause, + concurrency: evacuationBatchSize, } } @@ -821,9 +825,13 @@ func (s *Scaler) findMigrationTarget(region, excludeWorkerID string, requiredMem if w.ID == excludeWorkerID { continue } - if s.state.IsDraining(w.MachineID) { + if w.Draining { continue } + if s.state.IsDraining(w.MachineID) { + log.Printf("scaler: migration target %s had stale drain state without live registry drain flag; clearing", w.ID) + s.state.RemoveDraining(w.MachineID) + } // Subtract in-flight migrations from remaining capacity pending := s.state.GetInFlight(w.ID) remaining := w.Capacity - w.Current - pending @@ -1168,6 +1176,9 @@ func (s *Scaler) drainWorker(workerID, machineID, region, reason string) { ctx, cancel := context.WithTimeout(context.Background(), drainTimeout) defer cancel() policy := drainPolicyForReason(reason) + if machineID != "" { + defer s.state.RemoveDraining(machineID) + } // Mark worker as draining so routing skips it s.registry.SetDraining(workerID, true) @@ -1286,31 +1297,13 @@ func (s *Scaler) drainWorker(workerID, machineID, region, reason string) { return } - // Migrate a batch — bounded parallelism to avoid overwhelming - // network/disk on source and target workers. Each sandbox picks its - // own target (based on its own memory footprint) inside liveMigrateSandbox. 
- batch := running - if len(batch) > evacuationBatchSize { - batch = batch[:evacuationBatchSize] - } - - batchFailed := false - var wg sync.WaitGroup - var failCount int64 - for _, sandboxID := range batch { - wg.Add(1) - go func(sbID string) { - defer wg.Done() - if err := s.liveMigrateSandbox(ctx, sbID, workerID, ""); err != nil { - log.Printf("scaler: drain: migrate %s failed: %v", sbID, err) - atomic.AddInt64(&failCount, 1) - } - }(sandboxID) - } - wg.Wait() + // Migrate with bounded rolling parallelism. A slow sandbox occupies one + // worker slot, but does not block the remaining pending sandboxes from + // starting as other slots complete. + failCount := s.drainMigrateSandboxes(ctx, workerID, running, policy.concurrency) + batchFailed := failCount > 0 if failCount > 0 { migrationFailures += int(failCount) - batchFailed = true } if batchFailed { @@ -1321,6 +1314,46 @@ func (s *Scaler) drainWorker(workerID, machineID, region, reason string) { } } +func (s *Scaler) drainMigrateSandboxes(ctx context.Context, workerID string, sandboxIDs []string, concurrency int) int64 { + if concurrency <= 0 { + concurrency = evacuationBatchSize + } + if concurrency > len(sandboxIDs) { + concurrency = len(sandboxIDs) + } + + jobs := make(chan string) + var wg sync.WaitGroup + var failCount int64 + + for i := 0; i < concurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for sandboxID := range jobs { + if err := s.liveMigrateSandbox(ctx, sandboxID, workerID, ""); err != nil { + log.Printf("scaler: drain: migrate %s failed: %v", sandboxID, err) + atomic.AddInt64(&failCount, 1) + } + } + }() + } + + for _, sandboxID := range sandboxIDs { + select { + case <-ctx.Done(): + close(jobs) + wg.Wait() + return failCount + case jobs <- sandboxID: + } + } + close(jobs) + wg.Wait() + + return failCount +} + // waitForNaturalDrain polls until the worker has 0 sandboxes or the context expires. // Sandboxes expire via timeout or user-initiated destroy. 
func (s *Scaler) waitForNaturalDrain(ctx context.Context, workerID string) { @@ -1619,7 +1652,7 @@ func (s *Scaler) liveMigrateSandbox(ctx context.Context, sandboxID, sourceWorker // no QEMU, drain's ListSandboxes returns 0 and exits cleanly leaving // the row stuck. Mark error instead so the sandbox is visibly broken. if qmpSucceeded { - if err := s.store.FailMigrationPostQMP(ctx, sandboxID, "migration failed after QMP transfer; source VM gone, target failed to complete"); err != nil { + if err := s.store.FailMigrationPostQMP(ctx, sandboxID, db.PostQMPMigrationFailureMessage); err != nil { log.Printf("scaler: migrate %s: FailMigrationPostQMP failed: %v", sandboxID, err) } // Source VM is gone, target failed. Emit a `stopped` event so diff --git a/internal/controlplane/scaler_test.go b/internal/controlplane/scaler_test.go index a04aa792..a192a613 100644 --- a/internal/controlplane/scaler_test.go +++ b/internal/controlplane/scaler_test.go @@ -718,7 +718,7 @@ func TestFindMigrationTargetSkipsDraining(t *testing.T) { }) reg.addWorker(&WorkerInfo{ ID: "draining", MachineID: "osb-worker-draining", Region: "us-east-1", - Capacity: 50, Current: 5, CPUPct: 10, MemPct: 10, DiskPct: 10, + Capacity: 50, Current: 5, CPUPct: 10, MemPct: 10, DiskPct: 10, Draining: true, }) s := newTestScaler(reg, pool) @@ -730,6 +730,31 @@ func TestFindMigrationTargetSkipsDraining(t *testing.T) { } } +func TestFindMigrationTargetClearsStaleDrainState(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "hot", MachineID: "osb-worker-hot", Region: "us-east-1", + Capacity: 50, Current: 45, CPUPct: 85, MemPct: 50, DiskPct: 30, + }) + reg.addWorker(&WorkerInfo{ + ID: "candidate", MachineID: "osb-worker-candidate", Region: "us-east-1", + Capacity: 50, Current: 5, CPUPct: 10, MemPct: 10, DiskPct: 10, + }) + + s := newTestScaler(reg, pool) + s.state.SetDraining("osb-worker-candidate", &drainState{WorkerID: "candidate"}) + + target := 
s.findMigrationTarget("us-east-1", "hot", 0) + if target == nil || target.ID != "candidate" { + t.Fatalf("expected stale-drain candidate to be usable, got %#v", target) + } + if s.state.IsDraining("osb-worker-candidate") { + t.Fatalf("expected stale drain state to be cleared") + } +} + func TestFindMigrationTargetAccountsForInFlight(t *testing.T) { reg := newMockRegistry() pool := newMockPool() @@ -1796,6 +1821,9 @@ func TestDrainPolicyForReason(t *testing.T) { if normal.failurePause != drainBatchFailurePause { t.Fatalf("expected normal drain failure pause %s, got %s", drainBatchFailurePause, normal.failurePause) } + if normal.concurrency != evacuationBatchSize { + t.Fatalf("expected normal drain concurrency %d, got %d", evacuationBatchSize, normal.concurrency) + } spot := drainPolicyForReason(drainReasonSpotPreemption) if spot.backoff != spotDrainBackoff { @@ -1804,9 +1832,15 @@ func TestDrainPolicyForReason(t *testing.T) { if spot.failurePause != spotDrainFailurePause { t.Fatalf("expected spot drain failure pause %s, got %s", spotDrainFailurePause, spot.failurePause) } + if spot.concurrency != spotDrainConcurrency { + t.Fatalf("expected spot drain concurrency %d, got %d", spotDrainConcurrency, spot.concurrency) + } if spot.backoff >= normal.backoff { t.Fatalf("expected spot drain backoff to be shorter than normal drain backoff") } + if spot.concurrency <= normal.concurrency { + t.Fatalf("expected spot drain concurrency to be higher than normal drain concurrency") + } } func TestInMemoryScalerState_AllDraining(t *testing.T) { diff --git a/internal/controlplane/worker_registry.go b/internal/controlplane/worker_registry.go index 7dc9e77f..bb544894 100644 --- a/internal/controlplane/worker_registry.go +++ b/internal/controlplane/worker_registry.go @@ -13,22 +13,23 @@ import ( // WorkerInfo represents a registered worker. 
type WorkerInfo struct { - ID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` - DiskPct float64 `json:"disk_pct"` - TotalMemoryMB int `json:"total_memory_mb,omitempty"` - CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` - GoldenVersion string `json:"golden_version,omitempty"` - WorkerVersion string `json:"worker_version,omitempty"` - LastSeen time.Time `json:"-"` - MissedBeats int `json:"-"` + ID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` + DiskPct float64 `json:"disk_pct"` + Draining bool `json:"draining,omitempty"` + TotalMemoryMB int `json:"total_memory_mb,omitempty"` + CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` + GoldenVersion string `json:"golden_version,omitempty"` + WorkerVersion string `json:"worker_version,omitempty"` + LastSeen time.Time `json:"-"` + MissedBeats int `json:"-"` } // WorkerRegistry tracks live workers from NATS heartbeats. 
diff --git a/internal/db/migration_pgfixture_test.go b/internal/db/migration_pgfixture_test.go new file mode 100644 index 00000000..3810d9ba --- /dev/null +++ b/internal/db/migration_pgfixture_test.go @@ -0,0 +1,83 @@ +//go:build pgfixture + +package db + +import ( + "context" + "encoding/json" + "testing" +) + +func TestCompleteMigrationRecoversPostQMPError_pgfixture(t *testing.T) { + ctx := context.Background() + store := openPgStore(t) + orgID := seedOrgWithCap(t, store, 16) + sandboxID := freshSandboxID("recover-post-qmp") + + if _, err := store.CreateSandboxSession(ctx, sandboxID, orgID, nil, "default", "us-east-2", "source-worker", json.RawMessage(`{}`), json.RawMessage(`{}`), nil); err != nil { + t.Fatalf("create sandbox session: %v", err) + } + if err := store.SetMigrating(ctx, sandboxID, "first-target"); err != nil { + t.Fatalf("set migrating: %v", err) + } + if err := store.FailMigrationPostQMP(ctx, sandboxID, PostQMPMigrationFailureMessage); err != nil { + t.Fatalf("fail post-qmp: %v", err) + } + + if err := store.CompleteMigration(ctx, sandboxID, "second-target"); err != nil { + t.Fatalf("complete migration should recover post-QMP error: %v", err) + } + + sess, err := store.GetSandboxSession(ctx, sandboxID) + if err != nil { + t.Fatalf("get sandbox session: %v", err) + } + if sess.Status != "running" { + t.Fatalf("expected running, got %q", sess.Status) + } + if sess.WorkerID != "second-target" { + t.Fatalf("expected worker second-target, got %q", sess.WorkerID) + } + if sess.MigratingToWorker != "" { + t.Fatalf("expected migrating_to_worker cleared, got %q", sess.MigratingToWorker) + } + if sess.ErrorMsg != nil { + t.Fatalf("expected error_msg cleared, got %q", *sess.ErrorMsg) + } + if sess.StoppedAt != nil { + t.Fatalf("expected stopped_at cleared, got %s", sess.StoppedAt) + } +} + +func TestCompleteMigrationDoesNotRecoverUnrelatedError_pgfixture(t *testing.T) { + ctx := context.Background() + store := openPgStore(t) + orgID := seedOrgWithCap(t, 
store, 16) + sandboxID := freshSandboxID("unrelated-error") + + if _, err := store.CreateSandboxSession(ctx, sandboxID, orgID, nil, "default", "us-east-2", "source-worker", json.RawMessage(`{}`), json.RawMessage(`{}`), nil); err != nil { + t.Fatalf("create sandbox session: %v", err) + } + errMsg := "some unrelated failure" + if err := store.UpdateSandboxSessionStatus(ctx, sandboxID, "error", &errMsg); err != nil { + t.Fatalf("mark unrelated error: %v", err) + } + + if err := store.CompleteMigration(ctx, sandboxID, "second-target"); err == nil { + t.Fatalf("expected unrelated error row not to recover") + } + + sess, err := store.GetSandboxSession(ctx, sandboxID) + if err != nil { + t.Fatalf("get sandbox session: %v", err) + } + if sess.Status != "error" { + t.Fatalf("expected error status preserved, got %q", sess.Status) + } + if sess.WorkerID != "source-worker" { + t.Fatalf("expected worker source-worker preserved, got %q", sess.WorkerID) + } + if sess.ErrorMsg == nil || *sess.ErrorMsg != errMsg { + t.Fatalf("expected unrelated error message preserved, got %#v", sess.ErrorMsg) + } +} diff --git a/internal/db/store.go b/internal/db/store.go index 8586c9c6..2d5de90a 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -187,14 +187,14 @@ type Org struct { UpdatedAt time.Time `json:"updatedAt"` // Custom domain fields - CustomDomain *string `json:"customDomain,omitempty"` - CFHostnameID *string `json:"cfHostnameId,omitempty"` - DomainVerificationStatus string `json:"domainVerificationStatus"` - DomainSSLStatus string `json:"domainSslStatus"` - VerificationTxtName *string `json:"verificationTxtName,omitempty"` - VerificationTxtValue *string `json:"verificationTxtValue,omitempty"` - SSLTxtName *string `json:"sslTxtName,omitempty"` - SSLTxtValue *string `json:"sslTxtValue,omitempty"` + CustomDomain *string `json:"customDomain,omitempty"` + CFHostnameID *string `json:"cfHostnameId,omitempty"` + DomainVerificationStatus string `json:"domainVerificationStatus"` + 
DomainSSLStatus string `json:"domainSslStatus"` + VerificationTxtName *string `json:"verificationTxtName,omitempty"` + VerificationTxtValue *string `json:"verificationTxtValue,omitempty"` + SSLTxtName *string `json:"sslTxtName,omitempty"` + SSLTxtValue *string `json:"sslTxtValue,omitempty"` // WorkOS organization fields WorkOSOrgID *string `json:"workosOrgId,omitempty"` @@ -208,10 +208,10 @@ type Org struct { FreeCreditsRemainingCents int64 `json:"freeCreditsRemainingCents"` // Stripe billing fields - StripeCustomerID *string `json:"stripeCustomerId,omitempty"` - StripeSubscriptionID *string `json:"stripeSubscriptionId,omitempty"` - LastUsageReportedAt time.Time `json:"lastUsageReportedAt"` - PriceLocked bool `json:"priceLocked"` + StripeCustomerID *string `json:"stripeCustomerId,omitempty"` + StripeSubscriptionID *string `json:"stripeSubscriptionId,omitempty"` + LastUsageReportedAt time.Time `json:"lastUsageReportedAt"` + PriceLocked bool `json:"priceLocked"` // Per-org billing pipeline selector. 
'legacy' = UsageReporter ships // to Stripe; 'unified' = the phase-3 sender ships from @@ -630,25 +630,27 @@ func (s *Store) DeleteAPIKeyForOrg(ctx context.Context, id uuid.UUID, orgID uuid // --- Sandbox Session operations --- type SandboxSession struct { - ID uuid.UUID `json:"id"` - SandboxID string `json:"sandboxId"` - OrgID uuid.UUID `json:"orgId"` - UserID *uuid.UUID `json:"userId,omitempty"` - Template string `json:"template"` - Region string `json:"region"` - WorkerID string `json:"workerId"` - Status string `json:"status"` - Config json.RawMessage `json:"config"` - Metadata json.RawMessage `json:"metadata,omitempty"` - StartedAt time.Time `json:"startedAt"` - StoppedAt *time.Time `json:"stoppedAt,omitempty"` - ErrorMsg *string `json:"errorMsg,omitempty"` - BasedOnCheckpointID *uuid.UUID `json:"basedOnCheckpointId,omitempty"` - LastPatchSequence int `json:"lastPatchSequence"` - MigratingToWorker string `json:"migratingToWorker,omitempty"` - PatchError *string `json:"patchError,omitempty"` - GoldenVersion *string `json:"goldenVersion,omitempty"` -} + ID uuid.UUID `json:"id"` + SandboxID string `json:"sandboxId"` + OrgID uuid.UUID `json:"orgId"` + UserID *uuid.UUID `json:"userId,omitempty"` + Template string `json:"template"` + Region string `json:"region"` + WorkerID string `json:"workerId"` + Status string `json:"status"` + Config json.RawMessage `json:"config"` + Metadata json.RawMessage `json:"metadata,omitempty"` + StartedAt time.Time `json:"startedAt"` + StoppedAt *time.Time `json:"stoppedAt,omitempty"` + ErrorMsg *string `json:"errorMsg,omitempty"` + BasedOnCheckpointID *uuid.UUID `json:"basedOnCheckpointId,omitempty"` + LastPatchSequence int `json:"lastPatchSequence"` + MigratingToWorker string `json:"migratingToWorker,omitempty"` + PatchError *string `json:"patchError,omitempty"` + GoldenVersion *string `json:"goldenVersion,omitempty"` +} + +const PostQMPMigrationFailureMessage = "migration failed after QMP transfer; source VM gone, target failed to 
complete" func (s *Store) CreateSandboxSession(ctx context.Context, sandboxID string, orgID uuid.UUID, userID *uuid.UUID, template, region, workerID string, config, metadata json.RawMessage, secretStoreID *uuid.UUID) (*SandboxSession, error) { return s.CreateSandboxSessionWithStatus(ctx, sandboxID, orgID, userID, template, region, workerID, config, metadata, "running", secretStoreID) @@ -741,10 +743,17 @@ func (s *Store) CompleteMigration(ctx context.Context, sandboxID, newWorkerID st // Use status IN ('migrating', 'running', 'stopped') — a race with the source // worker's cleanup may have already set it to 'stopped' or reverted to 'running'. // The migration DID succeed (QEMU is running on the target), so force the update. + // Also recover the specific post-QMP error state: an earlier target-complete + // timeout may have marked the row error even though the target VM later became + // usable and participated in a subsequent successful migration. tag, err := s.pool.Exec(ctx, `UPDATE sandbox_sessions SET status = 'running', worker_id = $1, migrating_to_worker = '', stopped_at = NULL, error_msg = NULL - WHERE sandbox_id = $2 AND status IN ('migrating', 'running', 'stopped')`, - newWorkerID, sandboxID) + WHERE sandbox_id = $2 + AND ( + status IN ('migrating', 'running', 'stopped') + OR (status = 'error' AND error_msg = $3) + )`, + newWorkerID, sandboxID, PostQMPMigrationFailureMessage) if err != nil { return err } diff --git a/scripts/test-spot-drain-health.sh b/scripts/test-spot-drain-health.sh new file mode 100755 index 00000000..e5f54de8 --- /dev/null +++ b/scripts/test-spot-drain-health.sh @@ -0,0 +1,232 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Live-migration drain health test for spot/preemption drills. +# +# Required env: +# OPENSANDBOX_API_URL e.g. 
https://spot-poc.opencomputer.app or http://127.0.0.1:8080 +# OPENSANDBOX_API_KEY +# SOURCE_WORKER_ID worker to evacuate +# TARGET_WORKER_ID spare worker expected to receive the sandboxes +# +# Optional env: +# COUNT number of sandboxes to create (default: 40) +# TEMPLATE sandbox template (default: default) +# MEMORY_MB sandbox memory (default: 1024) +# CPU_COUNT sandbox vCPU count (default: 1) +# DISK_MB sandbox disk MB (default: 20480) +# POLL_SECONDS drain poll interval (default: 2) +# DRAIN_TIMEOUT_SECONDS drain timeout (default: 600) +# KEEP_SANDBOXES set to 1 to leave sandboxes running on exit (default: 0) +# FORCE_SOURCE_PLACEMENT set to 0 to skip temporarily draining target during create (default: 1) + +API_URL="${OPENSANDBOX_API_URL:?OPENSANDBOX_API_URL is required}" +API_KEY="${OPENSANDBOX_API_KEY:?OPENSANDBOX_API_KEY is required}" +SOURCE_WORKER_ID="${SOURCE_WORKER_ID:?SOURCE_WORKER_ID is required}" +TARGET_WORKER_ID="${TARGET_WORKER_ID:?TARGET_WORKER_ID is required}" + +COUNT="${COUNT:-40}" +TEMPLATE="${TEMPLATE:-default}" +MEMORY_MB="${MEMORY_MB:-1024}" +CPU_COUNT="${CPU_COUNT:-1}" +DISK_MB="${DISK_MB:-20480}" +POLL_SECONDS="${POLL_SECONDS:-2}" +DRAIN_TIMEOUT_SECONDS="${DRAIN_TIMEOUT_SECONDS:-600}" +KEEP_SANDBOXES="${KEEP_SANDBOXES:-0}" +FORCE_SOURCE_PLACEMENT="${FORCE_SOURCE_PLACEMENT:-1}" + +AUTH_HEADER="X-API-Key: ${API_KEY}" +CONTENT_HEADER="Content-Type: application/json" +SANDBOX_FILE="$(mktemp -t spot-drain-sandboxes.XXXXXX)" +FAIL_FILE="$(mktemp -t spot-drain-failures.XXXXXX)" + +cleanup() { + api POST "/admin/workers/${SOURCE_WORKER_ID}/drain?drain=false" >/dev/null 2>&1 || true + api POST "/admin/workers/${TARGET_WORKER_ID}/drain?drain=false" >/dev/null 2>&1 || true + + if [[ "$KEEP_SANDBOXES" == "1" ]]; then + echo "Leaving sandboxes running. IDs: $SANDBOX_FILE" + return + fi + + if [[ -s "$SANDBOX_FILE" ]]; then + echo "Cleaning up $(wc -l < "$SANDBOX_FILE" | tr -d ' ') sandboxes..." 
+ while read -r sandbox_id; do + [[ -n "$sandbox_id" ]] || continue + curl -fsS -X DELETE -H "$AUTH_HEADER" "${API_URL}/api/sandboxes/${sandbox_id}" >/dev/null || true + done < "$SANDBOX_FILE" + fi +} +trap cleanup EXIT + +api() { + local method="$1" + local path="$2" + local body="${3:-}" + + if [[ -n "$body" ]]; then + curl -fsS -X "$method" -H "$AUTH_HEADER" -H "$CONTENT_HEADER" -d "$body" "${API_URL}${path}" + else + curl -fsS -X "$method" -H "$AUTH_HEADER" "${API_URL}${path}" + fi +} + +json_value() { + local expr="$1" + python3 -c " +import json, sys +try: + data = json.load(sys.stdin) + value = ${expr} + print('' if value is None else value) +except Exception: + print('') +" +} + +tracked_counts() { + local source_count=0 target_count=0 other_count=0 non_running=0 total=0 + local sandbox_id meta status worker + + while read -r sandbox_id; do + [[ -n "$sandbox_id" ]] || continue + total=$((total + 1)) + meta=$(api GET "/api/sandboxes/${sandbox_id}" || echo '{}') + status=$(printf "%s" "$meta" | json_value 'data.get("status")') + worker=$(printf "%s" "$meta" | json_value 'data.get("workerID")') + + if [[ "$status" != "running" ]]; then + non_running=$((non_running + 1)) + elif [[ "$worker" == "$SOURCE_WORKER_ID" ]]; then + source_count=$((source_count + 1)) + elif [[ "$worker" == "$TARGET_WORKER_ID" ]]; then + target_count=$((target_count + 1)) + else + other_count=$((other_count + 1)) + fi + done < "$SANDBOX_FILE" + + echo "source=$source_count target=$target_count other=$other_count non_running=$non_running total=$total" +} + +create_sandbox() { + local body response sandbox_id + body=$(printf '{"template":"%s","memoryMB":%s,"cpuCount":%s,"diskMB":%s}' \ + "$TEMPLATE" "$MEMORY_MB" "$CPU_COUNT" "$DISK_MB") + response=$(api POST /api/sandboxes "$body") + sandbox_id=$(printf "%s" "$response" | json_value 'data.get("sandboxID") or data.get("id")') + if [[ -z "$sandbox_id" ]]; then + echo "create failed: $response" >&2 + return 1 + fi + echo "$sandbox_id" +} + 
+exec_ok() { # Succeeds only when running /bin/true through the exec API reports exitCode 0.
+  local sandbox_id="$1"
+  local response exit_code
+  response=$(api POST "/api/sandboxes/${sandbox_id}/exec/run" \
+    '{"cmd":"/bin/true","args":[],"timeout":5}' || true) # API failure leaves response empty -> treated as a failed probe
+  exit_code=$(printf "%s" "$response" | json_value 'data.get("exitCode")')
+  [[ "$exit_code" == "0" ]]
+}
+
+pty_ok() { # Succeeds when a PTY session can be opened (a sessionID is returned); the session is then closed best-effort.
+  local sandbox_id="$1"
+  local response session_id
+  response=$(api POST "/api/sandboxes/${sandbox_id}/pty" \
+    '{"cols":80,"rows":24,"shell":"/bin/bash"}' || true) # API failure leaves response empty -> no sessionID below
+  session_id=$(printf "%s" "$response" | json_value 'data.get("sessionID")')
+  if [[ -z "$session_id" ]]; then
+    return 1
+  fi
+  api DELETE "/api/sandboxes/${sandbox_id}/pty/${session_id}" >/dev/null || true # cleanup failure must not fail the probe
+}
+
+worker_of() { # Prints the sandbox's workerID; returns non-zero (via pipefail) when the metadata GET fails.
+  local sandbox_id="$1"
+  api GET "/api/sandboxes/${sandbox_id}" | json_value 'data.get("workerID")'
+}
+
+health_pass() { # Probes exec + PTY on every tracked sandbox; $1 is a log label. Returns 1 if any sandbox failed a probe.
+  local label="$1"
+  local failures=0 total=0 worker exec_status pty_status sandbox_id
+  : > "$FAIL_FILE" # truncate so the failure report only reflects this pass
+
+  echo "Health pass: $label"
+  while read -r sandbox_id; do
+    [[ -n "$sandbox_id" ]] || continue
+    total=$((total + 1))
+    worker=$(worker_of "$sandbox_id" || true) # informational only: a failed lookup must not trip errexit and abort the pass
+
+    exec_status=ok
+    if ! exec_ok "$sandbox_id"; then
+      exec_status=fail
+    fi
+
+    pty_status=ok
+    if ! pty_ok "$sandbox_id"; then
+      pty_status=fail
+    fi
+
+    printf "%s worker=%s exec=%s pty=%s\n" "$sandbox_id" "$worker" "$exec_status" "$pty_status"
+    if [[ "$exec_status" != "ok" || "$pty_status" != "ok" ]]; then
+      failures=$((failures + 1))
+      printf "%s worker=%s exec=%s pty=%s\n" "$sandbox_id" "$worker" "$exec_status" "$pty_status" >> "$FAIL_FILE"
+    fi
+  done < "$SANDBOX_FILE"
+
+  echo "Health summary: label=$label total=$total failures=$failures"
+  if [[ "$failures" -ne 0 ]]; then
+    cat "$FAIL_FILE" >&2
+    return 1
+  fi
+}
+
+echo "Clearing drain markers on source and target workers..."
+api POST "/admin/workers/${SOURCE_WORKER_ID}/drain?drain=false" >/dev/null || true +api POST "/admin/workers/${TARGET_WORKER_ID}/drain?drain=false" >/dev/null || true + +if [[ "$FORCE_SOURCE_PLACEMENT" == "1" ]]; then + echo "Temporarily draining target worker during create: $TARGET_WORKER_ID" + api POST "/admin/workers/${TARGET_WORKER_ID}/drain" >/dev/null +fi + +echo "Creating $COUNT sandboxes..." +for i in $(seq 1 "$COUNT"); do + sandbox_id=$(create_sandbox) + echo "$sandbox_id" | tee -a "$SANDBOX_FILE" + sleep 0.3 +done + +if [[ "$FORCE_SOURCE_PLACEMENT" == "1" ]]; then + echo "Clearing target worker drain marker before evacuation: $TARGET_WORKER_ID" + api POST "/admin/workers/${TARGET_WORKER_ID}/drain?drain=false" >/dev/null +fi + +echo "Initial tracked counts: $(tracked_counts)" +health_pass before-drain + +echo "Evacuating source worker: $SOURCE_WORKER_ID" +api POST "/admin/workers/${SOURCE_WORKER_ID}/evacuate" >/dev/null + +start_epoch=$(date +%s) +while true; do + counts=$(tracked_counts) + now=$(date -u "+%Y-%m-%d %H:%M:%S UTC") + elapsed=$(( $(date +%s) - start_epoch )) + echo "$now elapsed=${elapsed}s $counts" + + if [[ "$counts" == source=0* ]]; then + break + fi + if [[ "$elapsed" -gt "$DRAIN_TIMEOUT_SECONDS" ]]; then + echo "Drain timed out after ${elapsed}s" >&2 + exit 1 + fi + sleep "$POLL_SECONDS" +done + +echo "Final tracked counts: $(tracked_counts)" +health_pass after-drain +echo "PASS: all $COUNT sandboxes retained exec and PTY after drain" From 25ac2e193a737a6544ae0fb7f8ab3934456636a0 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Tue, 2 Jun 2026 22:29:13 -0700 Subject: [PATCH 11/32] Add alpha spot sandbox family --- cloudflare-workers/api-edge/src/index.ts | 43 ++++++++- .../api-edge/wrangler.prod.toml | 2 + cloudflare-workers/api-edge/wrangler.toml | 2 + internal/api/internal_sandbox.go | 5 + internal/api/sandbox.go | 50 ++++++++-- internal/api/sandbox_autoscale.go | 9 ++ pkg/types/sandbox.go | 92 ++++++++++++++----- 
pkg/types/sandbox_family_test.go | 53 +++++++++++ sdks/python/opencomputer/__init__.py | 3 +- sdks/python/opencomputer/sandbox.py | 16 ++++ sdks/typescript/src/index.ts | 1 + sdks/typescript/src/sandbox.ts | 19 ++++ 12 files changed, 261 insertions(+), 34 deletions(-) create mode 100644 pkg/types/sandbox_family_test.go diff --git a/cloudflare-workers/api-edge/src/index.ts b/cloudflare-workers/api-edge/src/index.ts index d3193daa..ae6cbfc5 100644 --- a/cloudflare-workers/api-edge/src/index.ts +++ b/cloudflare-workers/api-edge/src/index.ts @@ -33,6 +33,9 @@ export interface Env extends DashboardEnv { // CF_API_TOKEN and CF_ZONE_ID are optional in DashboardEnv (custom domain // feature gates on them). Inherited. ASSETS?: Fetcher; + // Optional alpha spot-only cell. When unset, sandboxFamily="spot" creates + // fail closed rather than silently landing on on-demand capacity. + SPOT_CELL_ID?: string; } // ── small helpers ──────────────────────────────────────────────────────── @@ -440,18 +443,54 @@ async function createSandbox(req: Request, env: Env): Promise { // Read body once — used for size-gating, the hard-pin cell peek, and the // verbatim forward to the CP. 
- const bodyText = await req.text(); + let bodyText = await req.text(); let requestedCellID: string | null = null; let bodyCpuCount = 0; let bodyMemoryMB = 0; let bodyDiskMB = 0; + let sandboxFamily = ""; try { if (bodyText) { - const parsed = JSON.parse(bodyText) as { cellId?: unknown; cpuCount?: unknown; memoryMB?: unknown; diskMB?: unknown }; + const parsed = JSON.parse(bodyText) as { + cellId?: unknown; + cpuCount?: unknown; + memoryMB?: unknown; + diskMB?: unknown; + sandboxFamily?: unknown; + image?: unknown; + snapshot?: unknown; + }; if (typeof parsed.cellId === "string") requestedCellID = parsed.cellId; if (typeof parsed.cpuCount === "number") bodyCpuCount = parsed.cpuCount; if (typeof parsed.memoryMB === "number") bodyMemoryMB = parsed.memoryMB; if (typeof parsed.diskMB === "number") bodyDiskMB = parsed.diskMB; + if (typeof parsed.sandboxFamily === "string") sandboxFamily = parsed.sandboxFamily; + + if (sandboxFamily === "default") sandboxFamily = ""; + if (sandboxFamily && sandboxFamily !== "spot") { + return json({ error: `unsupported sandboxFamily ${sandboxFamily}` }, 400); + } + if (sandboxFamily === "spot") { + if (!env.SPOT_CELL_ID) { + return json({ error: "spot sandbox alpha is not configured" }, 503); + } + if (requestedCellID && requestedCellID !== env.SPOT_CELL_ID) { + return json({ error: "sandboxFamily spot cannot be combined with a different cellId" }, 400); + } + if (parsed.image != null || parsed.snapshot != null) { + return json({ error: "sandboxFamily spot does not support image or snapshot creates in alpha" }, 400); + } + if ((bodyCpuCount && bodyCpuCount !== 1) || (bodyMemoryMB && bodyMemoryMB !== 1024)) { + return json({ error: "sandboxFamily spot is currently limited to 1 vCPU and 1024 MB memory" }, 400); + } + parsed.cpuCount = 1; + parsed.memoryMB = 1024; + parsed.sandboxFamily = "spot"; + requestedCellID = env.SPOT_CELL_ID; + bodyCpuCount = 1; + bodyMemoryMB = 1024; + bodyText = JSON.stringify(parsed); + } } } catch { /* malformed 
JSON — let the CP reject with a proper 400 */ diff --git a/cloudflare-workers/api-edge/wrangler.prod.toml b/cloudflare-workers/api-edge/wrangler.prod.toml index 32bfc6dc..39316060 100644 --- a/cloudflare-workers/api-edge/wrangler.prod.toml +++ b/cloudflare-workers/api-edge/wrangler.prod.toml @@ -62,6 +62,8 @@ WORKER_ENV = "prod" # No prod cells registered yet. Populate when prod CPs come online and # their rows are written to the D1 `cells` table. CELLS = "" +# Optional alpha route for sandboxFamily="spot". Leave empty to disable. +SPOT_CELL_ID = "" # Ship Worker logs to opencomputer-log-tail-prod → Axiom dataset cf-prod. [[tail_consumers]] diff --git a/cloudflare-workers/api-edge/wrangler.toml b/cloudflare-workers/api-edge/wrangler.toml index f5d32907..e5524569 100644 --- a/cloudflare-workers/api-edge/wrangler.toml +++ b/cloudflare-workers/api-edge/wrangler.toml @@ -84,6 +84,8 @@ WORKER_ENV = "dev" # the D1 `cells` table (see schema.sql + scripts/seed_dev_xcell.sh). # Cross-cell testbed: dev2 (Azure westus2) + dev3 (AWS us-east-1). CELLS = "azure-us-west-2-b,aws-us-east-1-a" +# Optional alpha route for sandboxFamily="spot". Leave empty to disable. +SPOT_CELL_ID = "" # Ship console output + exceptions to opensandbox-log-tail, which forwards # to Axiom dataset cf-dev. Keeps CF Worker logs durable in the same store diff --git a/internal/api/internal_sandbox.go b/internal/api/internal_sandbox.go index 407e2314..70cd1ffe 100644 --- a/internal/api/internal_sandbox.go +++ b/internal/api/internal_sandbox.go @@ -113,7 +113,12 @@ func (s *Server) internalCreateSandbox(c echo.Context) error { // so forks of that sandbox inherited a no-network config. cfg.EnsureNetworkEnabledDefault() + if err := types.ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{"error": err.Error()}) + } + // Same memory/cpu defaults the public POST /api/sandboxes applies. 
+ // Sandbox-family defaults are applied above and should not be overwritten. if cfg.MemoryMB == 0 { cfg.MemoryMB = 4096 cfg.CpuCount = 1 diff --git a/internal/api/sandbox.go b/internal/api/sandbox.go index dbd4b400..99af005b 100644 --- a/internal/api/sandbox.go +++ b/internal/api/sandbox.go @@ -32,6 +32,12 @@ func (s *Server) createSandbox(c echo.Context) error { // to sandbox_sessions.config_json is explicit and forks inherit it correctly. cfg.EnsureNetworkEnabledDefault() + if err := types.ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{ + "error": err.Error(), + }) + } + // Validate CPU/memory against allowed tiers. // Allowed tiers (memoryMB → vCPU): 1024→1, 4096→1, 8192→2, 16384→4, 32768→8, 65536→16. if err := types.ValidateResourceTier(&cfg); err != nil { @@ -71,7 +77,8 @@ func (s *Server) createSandbox(c echo.Context) error { } } - // Default to 4GB/1vCPU if not specified (all plans) + // Default to 4GB/1vCPU if not specified (all plans). Sandbox-family + // defaults are applied above and should not be overwritten here. if cfg.MemoryMB == 0 { cfg.MemoryMB = 4096 cfg.CpuCount = 1 @@ -177,6 +184,7 @@ func (s *Server) createSandbox(c echo.Context) error { "error": err.Error(), }) } + sb.SandboxFamily = cfg.SandboxFamily // Register with sandbox router for rolling timeout tracking. // timeout == 0 means "persistent" (no auto-hibernate). 
Negative values are @@ -669,11 +677,14 @@ func (s *Server) createSandboxRemote(c echo.Context, ctx context.Context, cfg ty s.emitEvent("create", grpcResp.SandboxId, worker.ID, fmt.Sprintf("created on %s", worker.ID[len(worker.ID)-8:])) resp := map[string]interface{}{ - "sandboxID": grpcResp.SandboxId, - "token": token, - "status": grpcResp.Status, - "region": region, - "workerID": worker.ID, + "sandboxID": grpcResp.SandboxId, + "token": token, + "status": grpcResp.Status, + "region": region, + "workerID": worker.ID, + "cpuCount": cfg.CpuCount, + "memoryMB": cfg.MemoryMB, + "sandboxFamily": cfg.SandboxFamily, } if s.sandboxDomain != "" { resp["sandboxDomain"] = s.sandboxDomain @@ -1184,6 +1195,21 @@ func (s *Server) effectivePlan(c echo.Context, orgID uuid.UUID) string { return "" } +func sandboxSessionFamily(session *db.SandboxSession) string { + if session == nil || len(session.Config) == 0 { + return types.SandboxFamilyDefault + } + var cfg types.SandboxConfig + if err := json.Unmarshal(session.Config, &cfg); err != nil { + return types.SandboxFamilyDefault + } + return cfg.SandboxFamily +} + +func isSpotSandboxSession(session *db.SandboxSession) bool { + return sandboxSessionFamily(session) == types.SandboxFamilySpot +} + func (s *Server) setLimits(c echo.Context) error { id := c.Param("id") ctx := c.Request().Context() @@ -1279,6 +1305,12 @@ func (s *Server) scaleSandbox(c echo.Context) error { // resources. Same code that the autoscale endpoint and the autoscaler // loop use, so SDK consumers can branch on a single error code. 
if s.store != nil { + if session, err := s.store.GetSandboxSession(c.Request().Context(), id); err == nil && isSpotSandboxSession(session) { + return c.JSON(http.StatusForbidden, map[string]any{ + "error": "spot sandboxes are fixed at 1 vCPU and 1024 MB in alpha", + "code": "sandbox_family_scale_disabled", + }) + } if locked, err := s.store.GetScalingLock(c.Request().Context(), id); err == nil && locked { return c.JSON(http.StatusForbidden, map[string]any{ "error": "scaling is locked on this sandbox — unlock via PUT /scaling-lock to allow size changes", @@ -1338,6 +1370,12 @@ func (s *Server) setLimitsRemote(c echo.Context, sandboxID string, maxPids int32 if err != nil { return c.JSON(http.StatusNotFound, map[string]string{"error": "sandbox not found"}) } + if isSpotSandboxSession(session) { + return c.JSON(http.StatusForbidden, map[string]any{ + "error": "spot sandboxes are fixed at 1 vCPU and 1024 MB in alpha", + "code": "sandbox_family_scale_disabled", + }) + } if session.Status != "running" { return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandbox is not running"}) } diff --git a/internal/api/sandbox_autoscale.go b/internal/api/sandbox_autoscale.go index 038434e9..1d86802e 100644 --- a/internal/api/sandbox_autoscale.go +++ b/internal/api/sandbox_autoscale.go @@ -45,6 +45,12 @@ func (s *Server) setAutoscale(c echo.Context) error { } if req.Enabled { + if session, err := s.store.GetSandboxSession(c.Request().Context(), sandboxID); err == nil && isSpotSandboxSession(session) { + return c.JSON(http.StatusForbidden, map[string]any{ + "error": "spot sandboxes are fixed at 1 vCPU and 1024 MB in alpha", + "code": "sandbox_family_scale_disabled", + }) + } // Refuse if the sandbox is scaling-locked. 
The lock auto-disables // autoscale on toggle; refusing here prevents a user from // re-enabling it while the lock is still on (which would be a @@ -151,6 +157,9 @@ func (a *AutoscalerSetter) SetSandboxMemoryMB(ctx context.Context, sandboxID str // shouldn't have picked them, but better to no-op cleanly than fail. return nil } + if isSpotSandboxSession(session) { + return fmt.Errorf("sandbox family spot is fixed at 1024 MB; refusing autoscale to %d", memoryMB) + } // Plan cap: the autoscaler runs in-process with no cap-token to read plan // from, so it asks the edge (D1 authority) before growing past the diff --git a/pkg/types/sandbox.go b/pkg/types/sandbox.go index 9919dd47..39932462 100644 --- a/pkg/types/sandbox.go +++ b/pkg/types/sandbox.go @@ -16,42 +16,52 @@ const ( SandboxStatusHibernated SandboxStatus = "hibernated" ) +const ( + SandboxFamilyDefault = "" + SandboxFamilySpot = "spot" +) + // Sandbox represents a running sandbox instance. type Sandbox struct { - ID string `json:"sandboxID"` - Template string `json:"templateID,omitempty"` - Alias string `json:"alias,omitempty"` - ClientID string `json:"clientID,omitempty"` - Status SandboxStatus `json:"status"` - StartedAt time.Time `json:"startedAt"` - EndAt time.Time `json:"endAt"` - Metadata map[string]string `json:"metadata,omitempty"` - CpuCount int `json:"cpuCount"` - MemoryMB int `json:"memoryMB"` - MachineID string `json:"machineID,omitempty"` + ID string `json:"sandboxID"` + Template string `json:"templateID,omitempty"` + Alias string `json:"alias,omitempty"` + ClientID string `json:"clientID,omitempty"` + Status SandboxStatus `json:"status"` + StartedAt time.Time `json:"startedAt"` + EndAt time.Time `json:"endAt"` + Metadata map[string]string `json:"metadata,omitempty"` + CpuCount int `json:"cpuCount"` + MemoryMB int `json:"memoryMB"` + SandboxFamily string `json:"sandboxFamily,omitempty"` + MachineID string `json:"machineID,omitempty"` // ConnectURL and Token are currently unused by SDKs. 
All data-plane traffic // flows through the control plane's SandboxAPIProxy, which proxies to workers // over the internal VPC network. Direct worker access support coming in a future release. - ConnectURL string `json:"connectURL,omitempty"` - Token string `json:"token,omitempty"` - HostPort int `json:"hostPort,omitempty"` // Mapped host port for the sandbox's container port + ConnectURL string `json:"connectURL,omitempty"` + Token string `json:"token,omitempty"` + HostPort int `json:"hostPort,omitempty"` // Mapped host port for the sandbox's container port } // SandboxConfig is the request body for creating a sandbox. type SandboxConfig struct { - Template string `json:"templateID,omitempty"` - Alias string `json:"alias,omitempty"` - Metadata map[string]string `json:"metadata,omitempty"` - Timeout int `json:"timeout,omitempty"` // seconds, default 300 - CpuCount int `json:"cpuCount,omitempty"` // default 1 - MemoryMB int `json:"memoryMB,omitempty"` // default 256 - DiskMB int `json:"diskMB,omitempty"` // workspace disk in MB (default 20480) - Envs map[string]string `json:"envs,omitempty"` - Port int `json:"port,omitempty"` // container port to expose via subdomain (default 80) + Template string `json:"templateID,omitempty"` + Alias string `json:"alias,omitempty"` + Metadata map[string]string `json:"metadata,omitempty"` + Timeout int `json:"timeout,omitempty"` // seconds, default 300 + CpuCount int `json:"cpuCount,omitempty"` // default 1 + MemoryMB int `json:"memoryMB,omitempty"` // default 256 + DiskMB int `json:"diskMB,omitempty"` // workspace disk in MB (default 20480) + // SandboxFamily selects an alpha placement/resource family. Empty is the + // default on-demand family. "spot" routes through spot-only capacity and is + // currently restricted to 1 vCPU / 1024 MB with scaling disabled. 
+ SandboxFamily string `json:"sandboxFamily,omitempty"` + Envs map[string]string `json:"envs,omitempty"` + Port int `json:"port,omitempty"` // container port to expose via subdomain (default 80) // NetworkEnabled is a pointer so we can distinguish "unset" from // "explicitly false". Unset defaults to true (see IsNetworkEnabled). - NetworkEnabled *bool `json:"networkEnabled,omitempty"` - ImageRef string `json:"imageRef,omitempty"` // resolved ECR URI for custom templates + NetworkEnabled *bool `json:"networkEnabled,omitempty"` + ImageRef string `json:"imageRef,omitempty"` // resolved ECR URI for custom templates // Sandbox snapshot template: S3 keys for rootfs and workspace drives. // When set, the sandbox boots from these drives instead of the standard base image. TemplateRootfsKey string `json:"templateRootfsKey,omitempty"` @@ -188,6 +198,38 @@ func ValidateResourceTier(cfg *SandboxConfig) error { return fmt.Errorf("cpuCount %d and memoryMB %d do not match an allowed tier; valid combinations: 1/1024, 1/4096, 2/8192, 4/16384, 8/32768, 16/65536", cfg.CpuCount, cfg.MemoryMB) } +// ApplySandboxFamilyDefaultsAndValidate normalizes alpha sandbox-family options +// before regular resource-tier validation. 
+func ApplySandboxFamilyDefaultsAndValidate(cfg *SandboxConfig) error { + switch cfg.SandboxFamily { + case SandboxFamilyDefault: + return nil + case "default": + cfg.SandboxFamily = SandboxFamilyDefault + return nil + case SandboxFamilySpot: + if len(cfg.ImageManifest) > 0 || cfg.Snapshot != "" { + return fmt.Errorf("sandboxFamily %q does not support image or snapshot creates in alpha", SandboxFamilySpot) + } + if cfg.MemoryMB == 0 { + cfg.MemoryMB = 1024 + } + if cfg.CpuCount == 0 { + cfg.CpuCount = 1 + } + if cfg.MemoryMB != 1024 || cfg.CpuCount != 1 { + return fmt.Errorf("sandboxFamily %q is currently limited to 1 vCPU and 1024 MB memory", SandboxFamilySpot) + } + return nil + default: + return fmt.Errorf("unsupported sandboxFamily %q", cfg.SandboxFamily) + } +} + +func (c SandboxConfig) IsSpotFamily() bool { + return c.SandboxFamily == SandboxFamilySpot +} + // SandboxListResponse is the response for listing sandboxes. type SandboxListResponse struct { Sandboxes []Sandbox `json:"sandboxes"` diff --git a/pkg/types/sandbox_family_test.go b/pkg/types/sandbox_family_test.go new file mode 100644 index 00000000..41834b92 --- /dev/null +++ b/pkg/types/sandbox_family_test.go @@ -0,0 +1,53 @@ +package types + +import ( + "encoding/json" + "strings" + "testing" +) + +func TestApplySandboxFamilyDefaultsAndValidateSpotDefaultsToSmallestTier(t *testing.T) { + cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot} + + if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg.CpuCount != 1 || cfg.MemoryMB != 1024 { + t.Fatalf("expected spot defaults 1 cpu / 1024MB, got cpu=%d memory=%d", cfg.CpuCount, cfg.MemoryMB) + } + if err := ValidateResourceTier(&cfg); err != nil { + t.Fatalf("spot defaults should be a valid tier: %v", err) + } +} + +func TestApplySandboxFamilyDefaultsAndValidateSpotRejectsLargerTier(t *testing.T) { + cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot, CpuCount: 1, MemoryMB: 4096} + + 
err := ApplySandboxFamilyDefaultsAndValidate(&cfg) + if err == nil || !strings.Contains(err.Error(), "limited to 1 vCPU and 1024 MB") { + t.Fatalf("expected spot size rejection, got %v", err) + } +} + +func TestApplySandboxFamilyDefaultsAndValidateSpotRejectsSnapshotAndImage(t *testing.T) { + for name, cfg := range map[string]SandboxConfig{ + "snapshot": {SandboxFamily: SandboxFamilySpot, Snapshot: "snap"}, + "image": {SandboxFamily: SandboxFamilySpot, ImageManifest: json.RawMessage(`{"steps":[]}`)}, + } { + t.Run(name, func(t *testing.T) { + err := ApplySandboxFamilyDefaultsAndValidate(&cfg) + if err == nil || !strings.Contains(err.Error(), "does not support image or snapshot") { + t.Fatalf("expected image/snapshot rejection, got %v", err) + } + }) + } +} + +func TestApplySandboxFamilyDefaultsAndValidateRejectsUnknownFamily(t *testing.T) { + cfg := SandboxConfig{SandboxFamily: "gpu"} + + err := ApplySandboxFamilyDefaultsAndValidate(&cfg) + if err == nil || !strings.Contains(err.Error(), "unsupported sandboxFamily") { + t.Fatalf("expected unsupported family rejection, got %v", err) + } +} diff --git a/sdks/python/opencomputer/__init__.py b/sdks/python/opencomputer/__init__.py index ae6b3471..4e98c0c9 100644 --- a/sdks/python/opencomputer/__init__.py +++ b/sdks/python/opencomputer/__init__.py @@ -1,6 +1,6 @@ """OpenComputer Python SDK - cloud sandbox platform.""" -from opencomputer.sandbox import Sandbox, ScalingLockedError, PlanLimitError +from opencomputer.sandbox import Sandbox, ScalingLockedError, PlanLimitError, SandboxFamilyLimitError from opencomputer.agent import Agent, AgentEvent, AgentSession, AgentSessionInfo from opencomputer.filesystem import Filesystem from opencomputer.exec import Exec, ProcessResult, ExecSession, ExecSessionInfo @@ -29,6 +29,7 @@ "Sandbox", "ScalingLockedError", "PlanLimitError", + "SandboxFamilyLimitError", "Agent", "AgentEvent", "AgentSession", diff --git a/sdks/python/opencomputer/sandbox.py b/sdks/python/opencomputer/sandbox.py 
index 5fe14e11..077044d3 100644 --- a/sdks/python/opencomputer/sandbox.py +++ b/sdks/python/opencomputer/sandbox.py @@ -33,6 +33,15 @@ class PlanLimitError(Exception): """ +class SandboxFamilyLimitError(Exception): + """Raised when a resource-changing call is blocked by the sandbox's + family. Alpha spot sandboxes are fixed at 1 vCPU / 1024 MB and cannot be + scaled. + """ + + code = "sandbox_family_scale_disabled" + + def _raise_scaling_error(resp: httpx.Response, action: str) -> None: """Inspect a non-OK scaling response and raise the most specific error. Falls back to ``raise_for_status`` so callers still see HTTP details for @@ -43,6 +52,8 @@ def _raise_scaling_error(resp: httpx.Response, action: str) -> None: body = {} if resp.status_code == 403 and isinstance(body, dict) and body.get("code") == "scaling_locked": raise ScalingLockedError(body.get("error", "scaling is locked on this sandbox")) + if resp.status_code == 403 and isinstance(body, dict) and body.get("code") == "sandbox_family_scale_disabled": + raise SandboxFamilyLimitError(body.get("error", "sandbox family does not allow scaling")) if resp.status_code == 402: msg = body.get("error", "plan limit exceeded") if isinstance(body, dict) else "plan limit exceeded" raise PlanLimitError(msg) @@ -75,6 +86,7 @@ async def create( api_url: str | None = None, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, + sandbox_family: str | None = None, disk_mb: int | None = None, memory_mb: int | None = None, secret_store: str | None = None, @@ -91,6 +103,8 @@ async def create( api_url: API URL (or OPENCOMPUTER_API_URL env var). envs: Environment variables to inject. Overrides store secrets. metadata: Custom metadata key-value pairs. + sandbox_family: Alpha placement/resource family. ``"spot"`` uses + spot-only capacity and is limited to 1 vCPU / 1024 MB. disk_mb: Workspace disk size in MB (default 20480 = 20GB). 
Any additional GB above 20GB is metered at a per-second rate comparable to EBS gp3. Closed beta: requests above 20GB @@ -134,6 +148,8 @@ async def create( body["envs"] = envs if metadata: body["metadata"] = metadata + if sandbox_family: + body["sandboxFamily"] = sandbox_family if disk_mb is not None: body["diskMB"] = disk_mb if memory_mb is not None: diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index 1bd215b0..e3512d49 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -2,6 +2,7 @@ export { Sandbox, ScalingLockedError, PlanLimitError, + SandboxFamilyLimitError, type SandboxOpts, type CheckpointInfo, type PatchInfo, diff --git a/sdks/typescript/src/sandbox.ts b/sdks/typescript/src/sandbox.ts index b1f9cb1a..8919bdfd 100644 --- a/sdks/typescript/src/sandbox.ts +++ b/sdks/typescript/src/sandbox.ts @@ -13,6 +13,8 @@ function resolveApiUrl(url: string): string { export interface SandboxOpts { template?: string; + /** Alpha placement/resource family. "spot" uses spot-only capacity and is limited to 1 vCPU / 1024 MB. */ + sandboxFamily?: "spot"; /** * Idle timeout in seconds after which the sandbox auto-hibernates. * Default: `0` (persistent — never auto-hibernate). @@ -46,6 +48,7 @@ interface SandboxData { sandboxID: string; status: string; templateID?: string; + sandboxFamily?: string; connectURL?: string; token?: string; sandboxDomain?: string; @@ -117,6 +120,18 @@ export class PlanLimitError extends Error { } } +/** + * Thrown when a resource-changing call is blocked by the sandbox's family. + * Alpha spot sandboxes are fixed at 1 vCPU / 1024 MB and cannot be scaled. + */ +export class SandboxFamilyLimitError extends Error { + readonly code = "sandbox_family_scale_disabled"; + constructor(message?: string) { + super(message ?? 
"sandbox family does not allow scaling"); + this.name = "SandboxFamilyLimitError"; + } +} + /** * Inspect a non-OK response from a scaling endpoint and throw the most * specific error type. Falls back to a generic Error when the response @@ -133,6 +148,9 @@ async function throwScalingError(resp: Response, action: string): Promise if (resp.status === 403 && body.code === "scaling_locked") { throw new ScalingLockedError(body.error); } + if (resp.status === 403 && body.code === "sandbox_family_scale_disabled") { + throw new SandboxFamilyLimitError(body.error); + } if (resp.status === 402) { throw new PlanLimitError(body.error); } @@ -245,6 +263,7 @@ export class Sandbox { }; if (opts.envs) body.envs = opts.envs; if (opts.metadata) body.metadata = opts.metadata; + if (opts.sandboxFamily) body.sandboxFamily = opts.sandboxFamily; if (opts.cpuCount != null) body.cpuCount = opts.cpuCount; if (opts.memoryMB != null) body.memoryMB = opts.memoryMB; if (opts.diskMB != null) body.diskMB = opts.diskMB; From 732e40b5c31f0aefecf03a8e06da28a0ae848059 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 3 Jun 2026 14:06:57 -0700 Subject: [PATCH 12/32] docs: add alpha spot sandboxes --- docs/api-reference/sandboxes/create.mdx | 4 + docs/docs.json | 7 + docs/reference/python-sdk/sandbox.mdx | 21 ++- docs/reference/python-sdk/scaling.mdx | 21 ++- docs/reference/typescript-sdk/sandbox.mdx | 19 ++ docs/reference/typescript-sdk/scaling.mdx | 23 ++- docs/sandboxes/spot-sandboxes.mdx | 203 ++++++++++++++++++++++ 7 files changed, 295 insertions(+), 3 deletions(-) create mode 100644 docs/sandboxes/spot-sandboxes.mdx diff --git a/docs/api-reference/sandboxes/create.mdx b/docs/api-reference/sandboxes/create.mdx index bc16af7f..9911a23d 100644 --- a/docs/api-reference/sandboxes/create.mdx +++ b/docs/api-reference/sandboxes/create.mdx @@ -21,6 +21,10 @@ Create a new sandbox. Memory in MB. Must match an allowed tier: `1024`, `4096`, `8192`, or `16384`. 
If omitted but `cpuCount` is set, inferred automatically. + + Alpha placement/resource family. Set to `"spot"` to create a [spot sandbox](/sandboxes/spot-sandboxes), currently fixed at 1 vCPU / 1 GB with scaling disabled. + + The allowed CPU/memory combinations are: | Memory | vCPU | diff --git a/docs/docs.json b/docs/docs.json index da0d6124..7fe28a75 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -46,6 +46,13 @@ "sandboxes/patches", "sandboxes/preview-urls", "sandboxes/elasticity", + { + "group": "Spot Sandboxes", + "tag": "Alpha", + "pages": [ + "sandboxes/spot-sandboxes" + ] + }, { "group": "Usage", "tag": "Preview", diff --git a/docs/reference/python-sdk/sandbox.mdx b/docs/reference/python-sdk/sandbox.mdx index 04c1cced..4fe4da40 100644 --- a/docs/reference/python-sdk/sandbox.mdx +++ b/docs/reference/python-sdk/sandbox.mdx @@ -33,6 +33,10 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) Arbitrary metadata + + Alpha placement/resource family. Set to `"spot"` for a [spot sandbox](/sandboxes/spot-sandboxes), currently fixed at 1 vCPU / 1 GB with active migration and scaling disabled. + + Declarative image definition (see [Image](/reference/python-sdk/image)) @@ -51,8 +55,23 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) sandbox = await Sandbox.create(template="my-stack", timeout=600) ``` +Create a spot sandbox: + +```python +sandbox = await Sandbox.create( + sandbox_family="spot", + timeout=300, +) +``` + + + Spot sandboxes are alpha. They run on spare capacity, are actively migrated + away from draining workers, and are priced roughly 3x cheaper than on-demand + sandboxes. They are currently limited to 1 vCPU / 1 GB and cannot be scaled. + + - `cpuCount` and `memoryMB` are not available in the Python SDK. Use the [HTTP API](/api-reference/sandboxes/create) for custom resources. + Custom `cpuCount` and `memoryMB` are not available in the Python SDK. 
Use the [HTTP API](/api-reference/sandboxes/create) for custom resources. Spot sandboxes do not need custom resource fields because the server applies the 1 vCPU / 1 GB alpha size automatically. --- diff --git a/docs/reference/python-sdk/scaling.mdx b/docs/reference/python-sdk/scaling.mdx index 214f16c9..9c478f2d 100644 --- a/docs/reference/python-sdk/scaling.mdx +++ b/docs/reference/python-sdk/scaling.mdx @@ -17,6 +17,7 @@ Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CP - **Manual `scale()`** disables autoscale on the sandbox as a side effect — explicit user intent overrides the loop. Re-enable with `set_autoscale(enabled=True, ...)` after if you want. - **Setting a scaling lock** disables autoscale at the same time (single knob: "I don't want this scaling, period"). While locked, both `scale()` and `set_autoscale(enabled=True)` raise `ScalingLockedError`. Unlocking does NOT auto-re-enable autoscale. +- **Spot sandboxes** are fixed at 1 vCPU / 1 GB during alpha. `scale()` and `set_autoscale(enabled=True)` raise `SandboxFamilyLimitError`. - **Plan caps** apply everywhere. Free-tier orgs are capped at 4 GB. Calls above the cap raise `PlanLimitError`. --- @@ -34,10 +35,11 @@ Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) **Raises:** - `ScalingLockedError` — sandbox has a scaling lock active. +- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `memory_mb` exceeds the org's plan cap. 
```python -from opencomputer import Sandbox, ScalingLockedError, PlanLimitError +from opencomputer import Sandbox, ScalingLockedError, PlanLimitError, SandboxFamilyLimitError sandbox = await Sandbox.connect("sb-abc123") @@ -46,6 +48,8 @@ try: print(f"scaled to {result['memoryMB']}MB / {result['cpuPercent']}% CPU") except ScalingLockedError: print("sandbox is locked — unlock to scale") +except SandboxFamilyLimitError: + print("spot sandboxes are fixed at 1 vCPU / 1 GB in alpha") except PlanLimitError: print("upgrade required for larger instances") ``` @@ -73,6 +77,7 @@ Enable or disable per-sandbox autoscale. **Raises:** - `ScalingLockedError` — sandbox has a scaling lock active. +- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `max_memory_mb` exceeds the org's plan cap. When `enabled=True`, the platform watches the sandbox's memory pressure and resizes it within the bounds: @@ -169,3 +174,17 @@ except ScalingLockedError: ### `PlanLimitError` Raised when the requested size exceeds the org's plan cap. The HTTP API returns 402 Payment Required for this case. + +### `SandboxFamilyLimitError` + +Raised when the sandbox's family does not allow resource changes. Alpha [spot sandboxes](/sandboxes/spot-sandboxes) are fixed at 1 vCPU / 1 GB, so `scale()` and `set_autoscale(enabled=True)` raise this error. + +```python +from opencomputer import SandboxFamilyLimitError + +try: + await sandbox.scale(memory_mb=4096) +except SandboxFamilyLimitError: + # Use a new on-demand sandbox if this workload needs more memory. + ... +``` diff --git a/docs/reference/typescript-sdk/sandbox.mdx b/docs/reference/typescript-sdk/sandbox.mdx index 45525539..edb3f1aa 100644 --- a/docs/reference/typescript-sdk/sandbox.mdx +++ b/docs/reference/typescript-sdk/sandbox.mdx @@ -33,6 +33,10 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) Arbitrary metadata + + Alpha placement/resource family. 
Set to `"spot"` for a [spot sandbox](/sandboxes/spot-sandboxes), currently fixed at 1 vCPU / 1 GB with active migration and scaling disabled. + + CPU cores @@ -63,6 +67,21 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) const sandbox = await Sandbox.create({ template: "my-stack", timeout: 600 }); ``` +Create a spot sandbox: + +```typescript +const sandbox = await Sandbox.create({ + sandboxFamily: "spot", + timeout: 300, +}); +``` + + + Spot sandboxes are alpha. They run on spare capacity, are actively migrated + away from draining workers, and are priced roughly 3x cheaper than on-demand + sandboxes. They are currently limited to 1 vCPU / 1 GB and cannot be scaled. + + --- ### `Sandbox.connect(sandboxId, opts?)` diff --git a/docs/reference/typescript-sdk/scaling.mdx b/docs/reference/typescript-sdk/scaling.mdx index 4ca44ae5..dcdf37a1 100644 --- a/docs/reference/typescript-sdk/scaling.mdx +++ b/docs/reference/typescript-sdk/scaling.mdx @@ -17,6 +17,7 @@ Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CP - **Manual `scale()`** disables autoscale on the sandbox as a side effect — explicit user intent overrides the loop. Re-enable autoscale with `setAutoscale({ enabled: true, ... })` after if you want. - **Setting a scaling lock** disables autoscale at the same time (single-knob: "I don't want this scaling, period"). While locked, both `scale()` and `setAutoscale({ enabled: true })` reject with `ScalingLockedError`. Unlocking does NOT auto-re-enable autoscale. +- **Spot sandboxes** are fixed at 1 vCPU / 1 GB during alpha. `scale()` and `setAutoscale({ enabled: true })` reject with `SandboxFamilyLimitError`. - **Plan caps** apply everywhere. Free-tier orgs are capped at 4 GB. Calls above the cap throw `PlanLimitError`. --- @@ -34,10 +35,11 @@ Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) **Throws:** - `ScalingLockedError` — sandbox has a scaling lock active. 
+- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `memoryMB` exceeds the org's plan cap. ```typescript -import { Sandbox, ScalingLockedError, PlanLimitError } from "@opencomputer/sdk"; +import { Sandbox, ScalingLockedError, PlanLimitError, SandboxFamilyLimitError } from "@opencomputer/sdk"; const sandbox = await Sandbox.connect("sb-abc123"); @@ -47,6 +49,8 @@ try { } catch (err) { if (err instanceof ScalingLockedError) { console.warn("sandbox is locked — unlock to scale"); + } else if (err instanceof SandboxFamilyLimitError) { + console.warn("spot sandboxes are fixed at 1 vCPU / 1 GB in alpha"); } else if (err instanceof PlanLimitError) { console.warn("upgrade required for larger instances"); } else { @@ -78,6 +82,7 @@ Enable or disable per-sandbox autoscale. **Throws:** - `ScalingLockedError` — sandbox has a scaling lock active. +- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `maxMemoryMB` exceeds the org's plan cap. When `enabled=true`, the platform watches the sandbox's memory pressure and resizes it within the bounds: @@ -177,3 +182,19 @@ try { ### `PlanLimitError` Thrown when the requested size exceeds the org's plan cap. The HTTP API returns 402 Payment Required for this case. + +### `SandboxFamilyLimitError` + +Thrown when the sandbox's family does not allow resource changes. Alpha [spot sandboxes](/sandboxes/spot-sandboxes) are fixed at 1 vCPU / 1 GB, so `scale()` and `setAutoscale({ enabled: true })` throw this error. + +```typescript +import { SandboxFamilyLimitError } from "@opencomputer/sdk"; + +try { + await sandbox.scale({ memoryMB: 4096 }); +} catch (err) { + if (err instanceof SandboxFamilyLimitError) { + // Use a new on-demand sandbox if this workload needs more memory. 
+ } +} +``` diff --git a/docs/sandboxes/spot-sandboxes.mdx b/docs/sandboxes/spot-sandboxes.mdx new file mode 100644 index 00000000..ea4e79ba --- /dev/null +++ b/docs/sandboxes/spot-sandboxes.mdx @@ -0,0 +1,203 @@ +--- +title: "Spot Sandboxes" +description: "Alpha sandbox family for interruption-tolerant, low-cost 1 vCPU / 1 GB workloads" +--- + + + Spot sandboxes are in alpha. They are only available for the 1 vCPU / 1 GB + sandbox size and cannot be scaled. OpenComputer actively migrates spot + sandboxes away from workers that are draining or at risk, but spot capacity is + still best-effort during alpha. + + +Spot sandboxes run on spare cloud capacity instead of standard on-demand capacity. They expose the same OpenComputer sandbox API for commands, files, PTY, preview URLs, and lifecycle operations. OpenComputer keeps spare spot capacity and migrates workloads between workers to make interruptions rare in normal operation, while still passing the lower infrastructure cost through to you. + +Use spot sandboxes when the work is cost-sensitive and can tolerate the small chance of an interruption. Avoid them for workloads where even a rare restart or disconnect would be unacceptable. + +## Pricing + +Spot sandboxes are priced roughly **3x cheaper than on-demand sandboxes**. + +The alpha offering is intentionally narrow: + +| Family | Size | Scaling | Availability | +| --- | --- | --- | --- | +| On-demand | Standard sandbox sizes | Supported | Standard capacity | +| Spot | 1 vCPU / 1 GB only | Disabled | Spare capacity with active migration | + +## Create a Spot Sandbox + +Set the sandbox family to `spot` when creating the sandbox. You do not need to pass `cpuCount` or `memoryMB`; spot sandboxes default to the only supported alpha size. 
+ + + +```typescript TypeScript +import { Sandbox } from "@opencomputer/sdk"; + +const sandbox = await Sandbox.create({ + sandboxFamily: "spot", + timeout: 300, +}); + +const result = await sandbox.exec.run("echo hello from spot"); +console.log(result.stdout); + +await sandbox.kill(); +``` + +```python Python +from opencomputer import Sandbox + +async with await Sandbox.create( + sandbox_family="spot", + timeout=300, +) as sandbox: + result = await sandbox.exec.run("echo hello from spot") + print(result.stdout) +``` + +```bash HTTP +curl -X POST https://app.opencomputer.dev/api/sandboxes \ + -H "X-API-Key: $OPENCOMPUTER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "sandboxFamily": "spot", + "timeout": 300 + }' +``` + + + +You can also pass the explicit size if you want the request body to be self-documenting: + +```json +{ + "sandboxFamily": "spot", + "cpuCount": 1, + "memoryMB": 1024 +} +``` + +Requests for larger spot sandboxes are rejected during alpha. + +## What Happens on Interruption + +Spot capacity can be reclaimed by the cloud provider. OpenComputer monitors for worker drain and interruption signals, keeps spare capacity available, and attempts to live-migrate running spot sandboxes to another worker before users are affected. + +In normal operation, this means spot sandboxes should usually keep running without a visible restart. There are still important limits: + +- Migration is best-effort and depends on spare spot capacity being available. +- A sudden worker loss can still interrupt active terminals, sockets, subprocesses, and in-flight commands. +- If a worker disappears before migration completes, in-memory process state can be lost. +- The sandbox's persistent disk can be preserved and used to resume elsewhere. + +Design spot workloads to assume interruptions are uncommon but possible: keep important state durable, make long commands retryable, and reconnect clients after transient disconnects. 
+ +## Good Fits + +Spot sandboxes work well for cost-sensitive workloads that benefit from active migration and can still tolerate the rare failed migration: + +- Batch code execution where failed jobs can be requeued. +- CI-style checks, test runners, linters, and formatters. +- AI agent tool calls with durable task state outside the sandbox. +- Parallel exploration jobs where losing one attempt is acceptable. +- Development, previews, and experiments where cost matters more than availability. + +## Poor Fits + +Use on-demand sandboxes instead when the residual interruption risk is unacceptable: + +- User-facing interactive sessions where a rare disconnect is still disruptive. +- Long-running stateful services that keep important state only in RAM. +- Databases, queues, or coordinators running inside the sandbox. +- Workloads with strict deadlines and no retry budget. +- Any task that cannot safely restart after a partial command failure. + +## Alpha Limitations + +During alpha, spot sandboxes have these restrictions: + +| Capability | Spot alpha behavior | +| --- | --- | +| Size | Fixed at 1 vCPU / 1 GB | +| Scaling | Manual scaling and autoscaling are disabled | +| Images and snapshots | Not supported for spot creates yet | +| Capacity | Best effort; create may fail or wait when spot capacity is full | +| Migration | Active live migration when workers drain or interruption signals arrive | + +If you call `scale` or enable autoscaling on a spot sandbox, the API returns a `sandbox_family_scale_disabled` error. 
+ + + +```typescript TypeScript +import { Sandbox, SandboxFamilyLimitError } from "@opencomputer/sdk"; + +const sandbox = await Sandbox.create({ sandboxFamily: "spot" }); + +try { + await sandbox.scale({ memoryMB: 4096 }); +} catch (err) { + if (err instanceof SandboxFamilyLimitError) { + console.log("Spot sandboxes are fixed at 1 vCPU / 1 GB in alpha."); + } else { + throw err; + } +} +``` + +```python Python +from opencomputer import Sandbox, SandboxFamilyLimitError + +sandbox = await Sandbox.create(sandbox_family="spot") + +try: + await sandbox.scale(memory_mb=4096) +except SandboxFamilyLimitError: + print("Spot sandboxes are fixed at 1 vCPU / 1 GB in alpha.") +finally: + await sandbox.kill() +``` + + + +## Reliability Pattern + +For agent or batch systems, treat a spot sandbox as reliable-but-retryable compute: + +1. Store task state outside the sandbox. +2. Write outputs to durable storage as the task progresses. +3. Keep long commands idempotent so rare retries are safe. +4. On disconnect or sandbox error, reconnect first; if the sandbox cannot resume, create a new spot sandbox and retry the task. +5. Use on-demand sandboxes for workloads with no interruption budget. 
+ +```typescript TypeScript +async function runRetryableTask(command: string) { + for (let attempt = 1; attempt <= 3; attempt++) { + const sandbox = await Sandbox.create({ + sandboxFamily: "spot", + timeout: 300, + metadata: { workload: "retryable-batch", attempt: String(attempt) }, + }); + + try { + const result = await sandbox.exec.run(command, { timeout: 120 }); + return result.stdout; + } finally { + await sandbox.kill().catch(() => {}); + } + } + + throw new Error("task failed after spot retries"); +} +``` + +## Related + + + + Resource tiers, autoscaling, and scaling limits + + + Save and resume sandbox state + + From ac863fdec375a7df3dd4fa44c952fe5fbcc56626 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 3 Jun 2026 20:47:34 -0700 Subject: [PATCH 13/32] Fix golden snapshot agent transport reset --- cmd/agent/listen_linux.go | 13 +++++- cmd/agent/listen_linux_test.go | 28 ++++++++++++ internal/qemu/manager.go | 82 +++++++++++++++++++++++++++++----- 3 files changed, 110 insertions(+), 13 deletions(-) create mode 100644 cmd/agent/listen_linux_test.go diff --git a/cmd/agent/listen_linux.go b/cmd/agent/listen_linux.go index 33a4f917..9f324188 100644 --- a/cmd/agent/listen_linux.go +++ b/cmd/agent/listen_linux.go @@ -111,6 +111,10 @@ func (l *virtioSerialListener) Accept() (net.Conn, error) { } l.mu.Unlock() + // Drain stale frames from the previous gRPC session before accepting a + // fresh host connection after restore. 
+ drainStaleData(l.f) + // Wait for the host to connect (port becomes readable with fresh data) if !waitForReadable(l.f, 500*time.Millisecond) { continue @@ -172,7 +176,14 @@ func waitForReadable(f *os.File, timeout time.Duration) bool { if err != nil || n <= 0 { return false } - return fds[0].Revents&(unix.POLLIN|unix.POLLHUP) != 0 + return readableForAccept(fds[0].Revents) +} + +func readableForAccept(revents int16) bool { + if revents&unix.POLLHUP != 0 { + return false + } + return revents&unix.POLLIN != 0 } // PrepareHibernate resets the active flag so that after migration restore, diff --git a/cmd/agent/listen_linux_test.go b/cmd/agent/listen_linux_test.go new file mode 100644 index 00000000..eb264afb --- /dev/null +++ b/cmd/agent/listen_linux_test.go @@ -0,0 +1,28 @@ +package main + +import ( + "testing" + + "golang.org/x/sys/unix" +) + +func TestReadableForAccept(t *testing.T) { + tests := []struct { + name string + revents int16 + want bool + }{ + {name: "pollin", revents: unix.POLLIN, want: true}, + {name: "pollhup", revents: unix.POLLHUP, want: false}, + {name: "pollin pollhup", revents: unix.POLLIN | unix.POLLHUP, want: false}, + {name: "pollerr", revents: unix.POLLERR, want: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := readableForAccept(tt.revents); got != tt.want { + t.Fatalf("readableForAccept(%v) = %v, want %v", tt.revents, got, tt.want) + } + }) + } +} diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index 7c63aa4a..957f28fa 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -205,6 +205,64 @@ func quiesceAndCloseAgent(ctx context.Context, agent *AgentClient) error { return nil } +// resetAndCloseAgentTransportForSnapshot syncs the guest and resets the +// virtio-serial listener without freezing filesystems. Golden snapshots restore +// into live sandboxes immediately, so unlike hibernate we must not capture a +// frozen root filesystem. 
+func resetAndCloseAgentTransportForSnapshot(ctx context.Context, agent *AgentClient) error { + if agent == nil { + return nil + } + + prepareOnce := func() error { + rpcCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + _, err := agent.PrepareHibernate(rpcCtx, &pb.PrepareHibernateRequest{}) + return err + } + err := prepareOnce() + if err != nil && IsTransportError(err) { + log.Printf("qemu: snapshot PrepareHibernate transport error (%v), redialing", err) + if rdErr := agent.Redial(); rdErr == nil { + err = prepareOnce() + } else { + log.Printf("qemu: snapshot PrepareHibernate redial failed: %v (orig: %v)", rdErr, err) + } + } + if err != nil { + if st, ok := status.FromError(err); !ok || st.Code() != codes.Unimplemented { + log.Printf("qemu: snapshot PrepareHibernate RPC failed: %v (falling back to legacy path)", err) + } + execOnce := func() error { + execCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + _, e := agent.Exec(execCtx, &pb.ExecRequest{ + Command: "/bin/sh", + Args: []string{"-c", "sync; blockdev --flushbufs /dev/vda 2>/dev/null; blockdev --flushbufs /dev/vdb 2>/dev/null; sync; kill -USR1 1"}, + RunAsRoot: true, + }) + return e + } + fallbackErr := execOnce() + if fallbackErr != nil && IsTransportError(fallbackErr) { + log.Printf("qemu: snapshot prepare fallback Exec transport error (%v), redialing", fallbackErr) + if rdErr := agent.Redial(); rdErr == nil { + fallbackErr = execOnce() + } else { + log.Printf("qemu: snapshot prepare fallback redial failed: %v", rdErr) + } + } + if fallbackErr != nil { + return fmt.Errorf("%w: PrepareHibernate=%v, fallback Exec=%v", ErrAgentUnresponsive, err, fallbackErr) + } + time.Sleep(1 * time.Second) + } + + _ = agent.Close() + time.Sleep(200 * time.Millisecond) + return nil +} + // Compile-time check that Manager implements sandbox.Manager. 
var _ sandbox.Manager = (*Manager)(nil) @@ -904,19 +962,19 @@ func (m *Manager) PrepareGoldenSnapshot() error { log.Printf("qemu: golden: /home/sandbox unmounted and synced") } - // Close agent connection before migration. Use a timeout because gRPC's - // graceful close over vsock can hang if vhost-vsock doesn't drain cleanly. - closeDone := make(chan struct{}) - go func() { - agentClient.Close() - close(closeDone) - }() - select { - case <-closeDone: - case <-time.After(2 * time.Second): - log.Printf("qemu: golden: agent close timed out, proceeding anyway") + // Reset the in-guest virtio-serial listener before snapshotting. A plain + // host-side Close can leave the golden image with a stale gRPC parser state; + // restored VMs then read HTTP/2 frames but never answer Ping. + prepCtx, prepCancel := context.WithTimeout(context.Background(), 15*time.Second) + if err := resetAndCloseAgentTransportForSnapshot(prepCtx, agentClient); err != nil { + prepCancel() + qmpClient.Close() + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("golden prepare agent transport: %w", err) } - time.Sleep(500 * time.Millisecond) + prepCancel() + log.Printf("qemu: golden: agent transport reset") // QMP stop + migrate log.Printf("qemu: golden: sending QMP stop...") From 6fdaafd28b750f0dd9694a1f3a2a8b844482447f Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 3 Jun 2026 20:57:24 -0700 Subject: [PATCH 14/32] Update spot sandbox pricing docs --- docs/reference/python-sdk/sandbox.mdx | 2 +- docs/reference/typescript-sdk/sandbox.mdx | 2 +- docs/sandboxes/spot-sandboxes.mdx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/python-sdk/sandbox.mdx b/docs/reference/python-sdk/sandbox.mdx index 4fe4da40..fb6dab21 100644 --- a/docs/reference/python-sdk/sandbox.mdx +++ b/docs/reference/python-sdk/sandbox.mdx @@ -66,7 +66,7 @@ sandbox = await Sandbox.create( Spot sandboxes are alpha. 
They run on spare capacity, are actively migrated - away from draining workers, and are priced roughly 3x cheaper than on-demand + away from draining workers, and are priced roughly 2x cheaper than on-demand sandboxes. They are currently limited to 1 vCPU / 1 GB and cannot be scaled. diff --git a/docs/reference/typescript-sdk/sandbox.mdx b/docs/reference/typescript-sdk/sandbox.mdx index edb3f1aa..cdd71207 100644 --- a/docs/reference/typescript-sdk/sandbox.mdx +++ b/docs/reference/typescript-sdk/sandbox.mdx @@ -78,7 +78,7 @@ const sandbox = await Sandbox.create({ Spot sandboxes are alpha. They run on spare capacity, are actively migrated - away from draining workers, and are priced roughly 3x cheaper than on-demand + away from draining workers, and are priced roughly 2x cheaper than on-demand sandboxes. They are currently limited to 1 vCPU / 1 GB and cannot be scaled. diff --git a/docs/sandboxes/spot-sandboxes.mdx b/docs/sandboxes/spot-sandboxes.mdx index ea4e79ba..b149c8af 100644 --- a/docs/sandboxes/spot-sandboxes.mdx +++ b/docs/sandboxes/spot-sandboxes.mdx @@ -16,7 +16,7 @@ Use spot sandboxes when the work is cost-sensitive and can tolerate the small ch ## Pricing -Spot sandboxes are priced roughly **3x cheaper than on-demand sandboxes**. +Spot sandboxes are priced roughly **2x cheaper than on-demand sandboxes**. 
The alpha offering is intentionally narrow: From 6cce58b154ec093b6fefafb1df63fedac8ed35c7 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 3 Jun 2026 22:08:31 -0700 Subject: [PATCH 15/32] Add resumable sandbox recovery --- cmd/worker/main.go | 149 +++++++- docs/api-reference/sandboxes/create.mdx | 10 +- docs/docs.json | 4 +- docs/reference/python-sdk/sandbox.mdx | 18 +- docs/reference/python-sdk/scaling.mdx | 21 +- docs/reference/typescript-sdk/sandbox.mdx | 14 +- docs/reference/typescript-sdk/scaling.mdx | 23 +- docs/sandboxes/resumable-sandboxes.mdx | 149 ++++++++ docs/sandboxes/spot-sandboxes.mdx | 203 ---------- internal/api/admin_drain_visualizer.go | 437 ++++++++++++++++++++++ internal/api/router.go | 1 + internal/api/sandbox.go | 29 +- internal/api/sandbox_autoscale.go | 9 - internal/compute/ec2.go | 33 +- internal/compute/quota_test.go | 24 ++ internal/controlplane/reconcile.go | 139 +++++-- internal/controlplane/redis_registry.go | 1 + internal/controlplane/scaler.go | 35 ++ internal/db/store.go | 28 ++ internal/qemu/manager.go | 15 +- internal/qemu/migration.go | 9 + internal/worker/grpc_server.go | 4 + pkg/types/sandbox.go | 43 ++- pkg/types/sandbox_family_test.go | 53 +-- sdks/python/opencomputer/sandbox.py | 14 +- sdks/typescript/src/sandbox.ts | 8 +- 26 files changed, 1093 insertions(+), 380 deletions(-) create mode 100644 docs/sandboxes/resumable-sandboxes.mdx delete mode 100644 docs/sandboxes/spot-sandboxes.mdx create mode 100644 internal/api/admin_drain_visualizer.go diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 1f4c0d04..8d947096 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "encoding/json" "fmt" "io" "log" @@ -710,8 +711,9 @@ func main() { go func() { notices := preemptMon.Watch(ctx) for notice := range notices { - log.Printf("opensandbox-worker: PREEMPTION notice from %s - action=%s eta=%s, draining now", + log.Printf("opensandbox-worker: PREEMPTION notice from %s - 
action=%s eta=%s, notifying resumable sandboxes", preemptMon.Name(), notice.Action, notice.ETA.Format(time.RFC3339)) + notifyResumableSandboxesBeforeRestart(context.Background(), mgr, store, sandboxDBMgr, 25*time.Second, notice.ETA) hb.Stop() return } @@ -922,6 +924,151 @@ func buildCheckpointBackend(label, endpoint, region, accessKeyID, secretAccessKe }) } +func notifyResumableSandboxesBeforeRestart(ctx context.Context, mgr sandbox.Manager, store *db.Store, sandboxDBs *sandbox.SandboxDBManager, noticeWindow time.Duration, eta time.Time) { + if mgr == nil { + return + } + wait := noticeWindow + if !eta.IsZero() { + until := time.Until(eta) + switch { + case until <= 0: + wait = 0 + case until <= 5*time.Second: + wait = until + case until-5*time.Second < wait: + wait = until - 5*time.Second + } + } + if wait < 0 { + wait = 0 + } + + listCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + sandboxes, err := mgr.List(listCtx) + cancel() + if err != nil { + log.Printf("opensandbox-worker: resumable restart notice: failed to list sandboxes: %v", err) + sleepContext(ctx, wait) + return + } + + started := time.Now() + noticeSeconds := int(wait.Seconds()) + if noticeSeconds < 0 { + noticeSeconds = 0 + } + hookTimeout := noticeSeconds + if hookTimeout <= 0 { + hookTimeout = 1 + } + if hookTimeout > 20 { + hookTimeout = 20 + } + + const restartNoticeScript = `notice="${OPENSANDBOX_RESUME_NOTICE_SECONDS:-25}" +export OPENSANDBOX_RESTART_NOTICE_SECONDS="$notice" +for hook in /etc/opencomputer/on-restart-notice /home/sandbox/.opencomputer/on-restart-notice; do + if [ -x "$hook" ]; then + "$hook" "$notice" + fi +done +sync` + + var wg sync.WaitGroup + notified := 0 + for _, sb := range sandboxes { + if sb.Status != "" && sb.Status != "running" { + continue + } + if !isResumableSandboxSession(ctx, store, sb.ID) { + continue + } + + notified++ + if sandboxDBs != nil { + if sdb, dbErr := sandboxDBs.Get(sb.ID); dbErr == nil { + _ = sdb.LogEvent("restart_notice", 
map[string]string{ + "sandbox_id": sb.ID, + "notice_seconds": fmt.Sprintf("%d", noticeSeconds), + "restart_reason": "worker_preemption", + "preserves_disk": "true", + "preserves_memory": "false", + }) + } + } + + sandboxID := sb.ID + wg.Add(1) + go func() { + defer wg.Done() + execCtx, execCancel := context.WithTimeout(ctx, time.Duration(hookTimeout)*time.Second) + defer execCancel() + _, err := mgr.Exec(execCtx, sandboxID, types.ProcessConfig{ + Command: "/bin/sh", + Args: []string{"-lc", restartNoticeScript}, + Env: map[string]string{ + "OPENSANDBOX_RESUMABLE": "true", + "OPENSANDBOX_RESUME_NOTICE_SECONDS": fmt.Sprintf("%d", noticeSeconds), + }, + Timeout: hookTimeout, + }) + if err != nil { + log.Printf("opensandbox-worker: resumable restart notice: hook failed for %s: %v", sandboxID, err) + } + }() + } + + if notified == 0 { + log.Printf("opensandbox-worker: resumable restart notice: no resumable sandboxes found") + sleepContext(ctx, wait) + return + } + log.Printf("opensandbox-worker: resumable restart notice: notifying %d sandboxes with %ds notice", notified, noticeSeconds) + wg.Wait() + + remaining := wait - time.Since(started) + if remaining > 0 { + sleepContext(ctx, remaining) + } + log.Printf("opensandbox-worker: resumable restart notice: completed for %d sandboxes", notified) +} + +func isResumableSandboxSession(ctx context.Context, store *db.Store, sandboxID string) bool { + if store == nil || sandboxID == "" { + return false + } + sessionCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + session, err := store.GetSandboxSession(sessionCtx, sandboxID) + if err != nil || session == nil { + if err != nil { + log.Printf("opensandbox-worker: resumable restart notice: failed to load sandbox session %s: %v", sandboxID, err) + } + return false + } + var cfg types.SandboxConfig + if len(session.Config) > 0 { + if err := json.Unmarshal(session.Config, &cfg); err != nil { + log.Printf("opensandbox-worker: resumable restart notice: failed to 
parse config for %s: %v", sandboxID, err) + return false + } + } + return cfg.IsResumable() +} + +func sleepContext(ctx context.Context, d time.Duration) { + if d <= 0 { + return + } + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-ctx.Done(): + case <-timer.C: + } +} + // createExecSessionQEMU creates an exec session using a QEMU agent client. func createExecSessionQEMU(agent *qm.AgentClient, sandboxID string, req types.ExecSessionCreateRequest) (*sandbox.ExecSessionHandle, error) { agentPB := &agentpb.ExecSessionCreateRequest{ diff --git a/docs/api-reference/sandboxes/create.mdx b/docs/api-reference/sandboxes/create.mdx index 9911a23d..d8a105bc 100644 --- a/docs/api-reference/sandboxes/create.mdx +++ b/docs/api-reference/sandboxes/create.mdx @@ -14,15 +14,15 @@ Create a new sandbox. - CPU cores. Must match an allowed tier: `1`, `2`, or `4`. If omitted but `memoryMB` is set, inferred automatically. + CPU cores. Must match an allowed tier: `1`, `2`, `4`, `8`, or `16`. If omitted but `memoryMB` is set, inferred automatically. - Memory in MB. Must match an allowed tier: `1024`, `4096`, `8192`, or `16384`. If omitted but `cpuCount` is set, inferred automatically. + Memory in MB. Must match an allowed tier: `1024`, `4096`, `8192`, `16384`, `32768`, or `65536`. If omitted but `cpuCount` is set, inferred automatically. - - Alpha placement/resource family. Set to `"spot"` to create a [spot sandbox](/sandboxes/spot-sandboxes), currently fixed at 1 vCPU / 1 GB with scaling disabled. + + Create a [resumable sandbox](/sandboxes/resumable-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. The allowed CPU/memory combinations are: @@ -33,6 +33,8 @@ The allowed CPU/memory combinations are: | 4096 MB (4 GB) | 1 | | 8192 MB (8 GB) | 2 | | 16384 MB (16 GB) | 4 | +| 32768 MB (32 GB) | 8 | +| 65536 MB (64 GB) | 16 | The 1 GB tier provides 1 vCPU on a best-effort basis. For guaranteed CPU allocation, use the 4 GB tier or above. 
diff --git a/docs/docs.json b/docs/docs.json index 7fe28a75..493c780c 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -47,10 +47,10 @@ "sandboxes/preview-urls", "sandboxes/elasticity", { - "group": "Spot Sandboxes", + "group": "Resumable Sandboxes", "tag": "Alpha", "pages": [ - "sandboxes/spot-sandboxes" + "sandboxes/resumable-sandboxes" ] }, { diff --git a/docs/reference/python-sdk/sandbox.mdx b/docs/reference/python-sdk/sandbox.mdx index fb6dab21..97347f61 100644 --- a/docs/reference/python-sdk/sandbox.mdx +++ b/docs/reference/python-sdk/sandbox.mdx @@ -33,8 +33,8 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) Arbitrary metadata - - Alpha placement/resource family. Set to `"spot"` for a [spot sandbox](/sandboxes/spot-sandboxes), currently fixed at 1 vCPU / 1 GB with active migration and scaling disabled. + + Create a [resumable sandbox](/sandboxes/resumable-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. @@ -55,23 +55,19 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) sandbox = await Sandbox.create(template="my-stack", timeout=600) ``` -Create a spot sandbox: +Create a resumable sandbox: ```python sandbox = await Sandbox.create( - sandbox_family="spot", + resumable=True, timeout=300, ) ``` - Spot sandboxes are alpha. They run on spare capacity, are actively migrated - away from draining workers, and are priced roughly 2x cheaper than on-demand - sandboxes. They are currently limited to 1 vCPU / 1 GB and cannot be scaled. - - - - Custom `cpuCount` and `memoryMB` are not available in the Python SDK. Use the [HTTP API](/api-reference/sandboxes/create) for custom resources. Spot sandboxes do not need custom resource fields because the server applies the 1 vCPU / 1 GB alpha size automatically. + Resumable sandboxes are alpha. They preserve filesystem state across + infrastructure restarts, may restart running processes, and are priced roughly + 2x cheaper than on-demand sandboxes. 
--- diff --git a/docs/reference/python-sdk/scaling.mdx b/docs/reference/python-sdk/scaling.mdx index 9c478f2d..214f16c9 100644 --- a/docs/reference/python-sdk/scaling.mdx +++ b/docs/reference/python-sdk/scaling.mdx @@ -17,7 +17,6 @@ Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CP - **Manual `scale()`** disables autoscale on the sandbox as a side effect — explicit user intent overrides the loop. Re-enable with `set_autoscale(enabled=True, ...)` after if you want. - **Setting a scaling lock** disables autoscale at the same time (single knob: "I don't want this scaling, period"). While locked, both `scale()` and `set_autoscale(enabled=True)` raise `ScalingLockedError`. Unlocking does NOT auto-re-enable autoscale. -- **Spot sandboxes** are fixed at 1 vCPU / 1 GB during alpha. `scale()` and `set_autoscale(enabled=True)` raise `SandboxFamilyLimitError`. - **Plan caps** apply everywhere. Free-tier orgs are capped at 4 GB. Calls above the cap raise `PlanLimitError`. --- @@ -35,11 +34,10 @@ Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) **Raises:** - `ScalingLockedError` — sandbox has a scaling lock active. -- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `memory_mb` exceeds the org's plan cap. ```python -from opencomputer import Sandbox, ScalingLockedError, PlanLimitError, SandboxFamilyLimitError +from opencomputer import Sandbox, ScalingLockedError, PlanLimitError sandbox = await Sandbox.connect("sb-abc123") @@ -48,8 +46,6 @@ try: print(f"scaled to {result['memoryMB']}MB / {result['cpuPercent']}% CPU") except ScalingLockedError: print("sandbox is locked — unlock to scale") -except SandboxFamilyLimitError: - print("spot sandboxes are fixed at 1 vCPU / 1 GB in alpha") except PlanLimitError: print("upgrade required for larger instances") ``` @@ -77,7 +73,6 @@ Enable or disable per-sandbox autoscale. 
**Raises:** - `ScalingLockedError` — sandbox has a scaling lock active. -- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `max_memory_mb` exceeds the org's plan cap. When `enabled=True`, the platform watches the sandbox's memory pressure and resizes it within the bounds: @@ -174,17 +169,3 @@ except ScalingLockedError: ### `PlanLimitError` Raised when the requested size exceeds the org's plan cap. The HTTP API returns 402 Payment Required for this case. - -### `SandboxFamilyLimitError` - -Raised when the sandbox's family does not allow resource changes. Alpha [spot sandboxes](/sandboxes/spot-sandboxes) are fixed at 1 vCPU / 1 GB, so `scale()` and `set_autoscale(enabled=True)` raise this error. - -```python -from opencomputer import SandboxFamilyLimitError - -try: - await sandbox.scale(memory_mb=4096) -except SandboxFamilyLimitError: - # Use a new on-demand sandbox if this workload needs more memory. - ... -``` diff --git a/docs/reference/typescript-sdk/sandbox.mdx b/docs/reference/typescript-sdk/sandbox.mdx index cdd71207..9c351332 100644 --- a/docs/reference/typescript-sdk/sandbox.mdx +++ b/docs/reference/typescript-sdk/sandbox.mdx @@ -33,8 +33,8 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) Arbitrary metadata - - Alpha placement/resource family. Set to `"spot"` for a [spot sandbox](/sandboxes/spot-sandboxes), currently fixed at 1 vCPU / 1 GB with active migration and scaling disabled. + + Create a [resumable sandbox](/sandboxes/resumable-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. @@ -67,19 +67,19 @@ Create a new sandbox. 
[HTTP API →](/api-reference/sandboxes/create) const sandbox = await Sandbox.create({ template: "my-stack", timeout: 600 }); ``` -Create a spot sandbox: +Create a resumable sandbox: ```typescript const sandbox = await Sandbox.create({ - sandboxFamily: "spot", + resumable: true, timeout: 300, }); ``` - Spot sandboxes are alpha. They run on spare capacity, are actively migrated - away from draining workers, and are priced roughly 2x cheaper than on-demand - sandboxes. They are currently limited to 1 vCPU / 1 GB and cannot be scaled. + Resumable sandboxes are alpha. They preserve filesystem state across + infrastructure restarts, may restart running processes, and are priced roughly + 2x cheaper than on-demand sandboxes. --- diff --git a/docs/reference/typescript-sdk/scaling.mdx b/docs/reference/typescript-sdk/scaling.mdx index dcdf37a1..4ca44ae5 100644 --- a/docs/reference/typescript-sdk/scaling.mdx +++ b/docs/reference/typescript-sdk/scaling.mdx @@ -17,7 +17,6 @@ Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CP - **Manual `scale()`** disables autoscale on the sandbox as a side effect — explicit user intent overrides the loop. Re-enable autoscale with `setAutoscale({ enabled: true, ... })` after if you want. - **Setting a scaling lock** disables autoscale at the same time (single-knob: "I don't want this scaling, period"). While locked, both `scale()` and `setAutoscale({ enabled: true })` reject with `ScalingLockedError`. Unlocking does NOT auto-re-enable autoscale. -- **Spot sandboxes** are fixed at 1 vCPU / 1 GB during alpha. `scale()` and `setAutoscale({ enabled: true })` reject with `SandboxFamilyLimitError`. - **Plan caps** apply everywhere. Free-tier orgs are capped at 4 GB. Calls above the cap throw `PlanLimitError`. --- @@ -35,11 +34,10 @@ Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) **Throws:** - `ScalingLockedError` — sandbox has a scaling lock active. 
-- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `memoryMB` exceeds the org's plan cap. ```typescript -import { Sandbox, ScalingLockedError, PlanLimitError, SandboxFamilyLimitError } from "@opencomputer/sdk"; +import { Sandbox, ScalingLockedError, PlanLimitError } from "@opencomputer/sdk"; const sandbox = await Sandbox.connect("sb-abc123"); @@ -49,8 +47,6 @@ try { } catch (err) { if (err instanceof ScalingLockedError) { console.warn("sandbox is locked — unlock to scale"); - } else if (err instanceof SandboxFamilyLimitError) { - console.warn("spot sandboxes are fixed at 1 vCPU / 1 GB in alpha"); } else if (err instanceof PlanLimitError) { console.warn("upgrade required for larger instances"); } else { @@ -82,7 +78,6 @@ Enable or disable per-sandbox autoscale. **Throws:** - `ScalingLockedError` — sandbox has a scaling lock active. -- `SandboxFamilyLimitError` — sandbox belongs to a family with fixed resources, such as alpha spot sandboxes. - `PlanLimitError` — `maxMemoryMB` exceeds the org's plan cap. When `enabled=true`, the platform watches the sandbox's memory pressure and resizes it within the bounds: @@ -182,19 +177,3 @@ try { ### `PlanLimitError` Thrown when the requested size exceeds the org's plan cap. The HTTP API returns 402 Payment Required for this case. - -### `SandboxFamilyLimitError` - -Thrown when the sandbox's family does not allow resource changes. Alpha [spot sandboxes](/sandboxes/spot-sandboxes) are fixed at 1 vCPU / 1 GB, so `scale()` and `setAutoscale({ enabled: true })` throw this error. - -```typescript -import { SandboxFamilyLimitError } from "@opencomputer/sdk"; - -try { - await sandbox.scale({ memoryMB: 4096 }); -} catch (err) { - if (err instanceof SandboxFamilyLimitError) { - // Use a new on-demand sandbox if this workload needs more memory. 
- } -} -``` diff --git a/docs/sandboxes/resumable-sandboxes.mdx b/docs/sandboxes/resumable-sandboxes.mdx new file mode 100644 index 00000000..21088307 --- /dev/null +++ b/docs/sandboxes/resumable-sandboxes.mdx @@ -0,0 +1,149 @@ +--- +title: "Resumable Sandboxes" +description: "Alpha lower-cost sandboxes that preserve disk across infrastructure restarts" +--- + + + Resumable sandboxes are in alpha. They preserve filesystem state across + infrastructure restarts, but running processes, in-memory state, terminal + sessions, and open network connections may restart. + + +Resumable sandboxes are lower-cost OpenComputer sandboxes designed for workloads that can recover from a process restart. They expose the same OpenComputer API for commands, files, PTY, preview URLs, images, snapshots, scaling, and lifecycle operations. + +If OpenComputer receives advance infrastructure interruption notice, the sandbox gets up to **25 seconds** to flush state before it restarts on healthy capacity. The sandbox's filesystem is preserved and made available after resume. + +## Pricing + +Resumable sandboxes are priced roughly **2x cheaper than on-demand sandboxes**. + +## Create a Resumable Sandbox + +Set `resumable: true` when creating the sandbox. 
+ + + +```typescript TypeScript +import { Sandbox } from "@opencomputer/sdk"; + +const sandbox = await Sandbox.create({ + resumable: true, + memoryMB: 4096, + timeout: 300, +}); + +const result = await sandbox.exec.run("echo hello from resumable"); +console.log(result.stdout); + +await sandbox.kill(); +``` + +```python Python +from opencomputer import Sandbox + +async with await Sandbox.create( + resumable=True, + memory_mb=4096, + timeout=300, +) as sandbox: + result = await sandbox.exec.run("echo hello from resumable") + print(result.stdout) +``` + +```bash HTTP +curl -X POST https://app.opencomputer.dev/api/sandboxes \ + -H "X-API-Key: $OPENCOMPUTER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "resumable": true, + "memoryMB": 4096, + "timeout": 300 + }' +``` + + + +## What Happens on Interruption + +Infrastructure capacity can be reclaimed or restarted by the cloud provider. When OpenComputer receives advance notice, it notifies the sandbox and starts the resumability flow. + +The sandbox receives these environment variables: + +- `OPENSANDBOX_RESUMABLE=true` +- `OPENSANDBOX_RESUME_NOTICE_SECONDS=25` + +During the notice window, write important state to disk. After resume, restart your process from durable state. + +## Good Fits + +Resumable sandboxes work well for cost-sensitive workloads that can restart from disk: + +- Batch code execution where failed jobs can be requeued. +- CI-style checks, test runners, linters, and formatters. +- AI agent tool calls with durable task state outside the sandbox. +- Parallel exploration jobs where one attempt can restart. +- Development, previews, and experiments where cost matters more than availability. + +## Poor Fits + +Use on-demand sandboxes when process continuity is required: + +- User-facing interactive sessions where a rare disconnect is still disruptive. +- Long-running stateful services that keep important state only in RAM. +- Databases, queues, or coordinators running inside the sandbox. 
+- Workloads with strict deadlines and no retry budget. +- Any task that cannot safely restart after a partial command failure. + +## Alpha Limitations + +During alpha, resumable sandboxes have these restrictions: + +| Capability | Resumable alpha behavior | +| --- | --- | +| Process state | May restart | +| Filesystem | Preserved across resumable restarts | +| Notice window | Up to 25 seconds when advance notice is available | +| Sudden host failure | May resume without advance notice | +| Capacity | Best effort; create may fail or wait when resumable capacity is full | + +## Reliability Pattern + +For agent or batch systems, treat a resumable sandbox as durable-disk, restartable compute: + +1. Store task state on disk or outside the sandbox. +2. Write outputs to durable storage as the task progresses. +3. Keep long commands idempotent so restarts are safe. +4. On disconnect, reconnect and check the sandbox before retrying the task. +5. Use on-demand sandboxes for workloads with no interruption budget. + +```typescript TypeScript +async function runRetryableTask(command: string) { + for (let attempt = 1; attempt <= 3; attempt++) { + const sandbox = await Sandbox.create({ + resumable: true, + timeout: 300, + metadata: { workload: "retryable-batch", attempt: String(attempt) }, + }); + + try { + const result = await sandbox.exec.run(command, { timeout: 120 }); + return result.stdout; + } finally { + await sandbox.kill().catch(() => {}); + } + } + + throw new Error("task failed after resumable retries"); +} +``` + +## Related + + + + Resource tiers and autoscaling + + + Save and resume sandbox state + + diff --git a/docs/sandboxes/spot-sandboxes.mdx b/docs/sandboxes/spot-sandboxes.mdx deleted file mode 100644 index b149c8af..00000000 --- a/docs/sandboxes/spot-sandboxes.mdx +++ /dev/null @@ -1,203 +0,0 @@ ---- -title: "Spot Sandboxes" -description: "Alpha sandbox family for interruption-tolerant, low-cost 1 vCPU / 1 GB workloads" ---- - - - Spot sandboxes are in alpha. 
They are only available for the 1 vCPU / 1 GB - sandbox size and cannot be scaled. OpenComputer actively migrates spot - sandboxes away from workers that are draining or at risk, but spot capacity is - still best-effort during alpha. - - -Spot sandboxes run on spare cloud capacity instead of standard on-demand capacity. They expose the same OpenComputer sandbox API for commands, files, PTY, preview URLs, and lifecycle operations. OpenComputer keeps spare spot capacity and migrates workloads between workers to make interruptions rare in normal operation, while still passing the lower infrastructure cost through to you. - -Use spot sandboxes when the work is cost-sensitive and can tolerate the small chance of an interruption. Avoid them for workloads where even a rare restart or disconnect would be unacceptable. - -## Pricing - -Spot sandboxes are priced roughly **2x cheaper than on-demand sandboxes**. - -The alpha offering is intentionally narrow: - -| Family | Size | Scaling | Availability | -| --- | --- | --- | --- | -| On-demand | Standard sandbox sizes | Supported | Standard capacity | -| Spot | 1 vCPU / 1 GB only | Disabled | Spare capacity with active migration | - -## Create a Spot Sandbox - -Set the sandbox family to `spot` when creating the sandbox. You do not need to pass `cpuCount` or `memoryMB`; spot sandboxes default to the only supported alpha size. 
- - - -```typescript TypeScript -import { Sandbox } from "@opencomputer/sdk"; - -const sandbox = await Sandbox.create({ - sandboxFamily: "spot", - timeout: 300, -}); - -const result = await sandbox.exec.run("echo hello from spot"); -console.log(result.stdout); - -await sandbox.kill(); -``` - -```python Python -from opencomputer import Sandbox - -async with await Sandbox.create( - sandbox_family="spot", - timeout=300, -) as sandbox: - result = await sandbox.exec.run("echo hello from spot") - print(result.stdout) -``` - -```bash HTTP -curl -X POST https://app.opencomputer.dev/api/sandboxes \ - -H "X-API-Key: $OPENCOMPUTER_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "sandboxFamily": "spot", - "timeout": 300 - }' -``` - - - -You can also pass the explicit size if you want the request body to be self-documenting: - -```json -{ - "sandboxFamily": "spot", - "cpuCount": 1, - "memoryMB": 1024 -} -``` - -Requests for larger spot sandboxes are rejected during alpha. - -## What Happens on Interruption - -Spot capacity can be reclaimed by the cloud provider. OpenComputer monitors for worker drain and interruption signals, keeps spare capacity available, and attempts to live-migrate running spot sandboxes to another worker before users are affected. - -In normal operation, this means spot sandboxes should usually keep running without a visible restart. There are still important limits: - -- Migration is best-effort and depends on spare spot capacity being available. -- A sudden worker loss can still interrupt active terminals, sockets, subprocesses, and in-flight commands. -- If a worker disappears before migration completes, in-memory process state can be lost. -- The sandbox's persistent disk can be preserved and used to resume elsewhere. - -Design spot workloads to assume interruptions are uncommon but possible: keep important state durable, make long commands retryable, and reconnect clients after transient disconnects. 
- -## Good Fits - -Spot sandboxes work well for cost-sensitive workloads that benefit from active migration and can still tolerate the rare failed migration: - -- Batch code execution where failed jobs can be requeued. -- CI-style checks, test runners, linters, and formatters. -- AI agent tool calls with durable task state outside the sandbox. -- Parallel exploration jobs where losing one attempt is acceptable. -- Development, previews, and experiments where cost matters more than availability. - -## Poor Fits - -Use on-demand sandboxes instead when the residual interruption risk is unacceptable: - -- User-facing interactive sessions where a rare disconnect is still disruptive. -- Long-running stateful services that keep important state only in RAM. -- Databases, queues, or coordinators running inside the sandbox. -- Workloads with strict deadlines and no retry budget. -- Any task that cannot safely restart after a partial command failure. - -## Alpha Limitations - -During alpha, spot sandboxes have these restrictions: - -| Capability | Spot alpha behavior | -| --- | --- | -| Size | Fixed at 1 vCPU / 1 GB | -| Scaling | Manual scaling and autoscaling are disabled | -| Images and snapshots | Not supported for spot creates yet | -| Capacity | Best effort; create may fail or wait when spot capacity is full | -| Migration | Active live migration when workers drain or interruption signals arrive | - -If you call `scale` or enable autoscaling on a spot sandbox, the API returns a `sandbox_family_scale_disabled` error. 
- - - -```typescript TypeScript -import { Sandbox, SandboxFamilyLimitError } from "@opencomputer/sdk"; - -const sandbox = await Sandbox.create({ sandboxFamily: "spot" }); - -try { - await sandbox.scale({ memoryMB: 4096 }); -} catch (err) { - if (err instanceof SandboxFamilyLimitError) { - console.log("Spot sandboxes are fixed at 1 vCPU / 1 GB in alpha."); - } else { - throw err; - } -} -``` - -```python Python -from opencomputer import Sandbox, SandboxFamilyLimitError - -sandbox = await Sandbox.create(sandbox_family="spot") - -try: - await sandbox.scale(memory_mb=4096) -except SandboxFamilyLimitError: - print("Spot sandboxes are fixed at 1 vCPU / 1 GB in alpha.") -finally: - await sandbox.kill() -``` - - - -## Reliability Pattern - -For agent or batch systems, treat a spot sandbox as reliable-but-retryable compute: - -1. Store task state outside the sandbox. -2. Write outputs to durable storage as the task progresses. -3. Keep long commands idempotent so rare retries are safe. -4. On disconnect or sandbox error, reconnect first; if the sandbox cannot resume, create a new spot sandbox and retry the task. -5. Use on-demand sandboxes for workloads with no interruption budget. 
- -```typescript TypeScript -async function runRetryableTask(command: string) { - for (let attempt = 1; attempt <= 3; attempt++) { - const sandbox = await Sandbox.create({ - sandboxFamily: "spot", - timeout: 300, - metadata: { workload: "retryable-batch", attempt: String(attempt) }, - }); - - try { - const result = await sandbox.exec.run(command, { timeout: 120 }); - return result.stdout; - } finally { - await sandbox.kill().catch(() => {}); - } - } - - throw new Error("task failed after spot retries"); -} -``` - -## Related - - - - Resource tiers, autoscaling, and scaling limits - - - Save and resume sandbox state - - diff --git a/internal/api/admin_drain_visualizer.go b/internal/api/admin_drain_visualizer.go new file mode 100644 index 00000000..813dc507 --- /dev/null +++ b/internal/api/admin_drain_visualizer.go @@ -0,0 +1,437 @@ +package api + +import ( + "net/http" + + "github.com/labstack/echo/v4" +) + +// adminDrainVisualizerPage serves an operator-only view for watching live +// migration drains. It intentionally lives under /admin so the existing API-key +// middleware guards the drain/evacuate controls. +func (s *Server) adminDrainVisualizerPage(c echo.Context) error { + return c.HTML(http.StatusOK, adminDrainVisualizerHTML) +} + +const adminDrainVisualizerHTML = ` + + + + + OpenComputer drain visualizer + + + +
+

Drain visualizer

+
+ connecting + + + +
+
+
+
+
Workers
0
+
Sandboxes
0
+
Draining
0
+
Moves
0
+
+
+
+

Events

+
+
+
+ + +` diff --git a/internal/api/router.go b/internal/api/router.go index 2adca624..8773facc 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -277,6 +277,7 @@ func NewServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, apiKey string, o admin.POST("/workers/:id/evacuate", s.adminEvacuateWorker) admin.GET("/demo/migration", s.demoPingPongPage) admin.GET("/demo/chaos", s.demoChaosPage) + admin.GET("/demo/drain", s.adminDrainVisualizerPage) // Signed URL endpoints (self-authenticated via HMAC, no API key required) e.GET("/api/sandboxes/:id/files/download", s.signedDownload) diff --git a/internal/api/sandbox.go b/internal/api/sandbox.go index 99af005b..0537f4e8 100644 --- a/internal/api/sandbox.go +++ b/internal/api/sandbox.go @@ -37,6 +37,7 @@ func (s *Server) createSandbox(c echo.Context) error { "error": err.Error(), }) } + cfg = withResumableSandboxEnv(cfg) // Validate CPU/memory against allowed tiers. // Allowed tiers (memoryMB → vCPU): 1024→1, 4096→1, 8192→2, 16384→4, 32768→8, 65536→16. @@ -185,6 +186,7 @@ func (s *Server) createSandbox(c echo.Context) error { }) } sb.SandboxFamily = cfg.SandboxFamily + sb.Resumable = cfg.IsResumable() // Register with sandbox router for rolling timeout tracking. // timeout == 0 means "persistent" (no auto-hibernate). 
Negative values are @@ -685,6 +687,7 @@ func (s *Server) createSandboxRemote(c echo.Context, ctx context.Context, cfg ty "cpuCount": cfg.CpuCount, "memoryMB": cfg.MemoryMB, "sandboxFamily": cfg.SandboxFamily, + "resumable": cfg.IsResumable(), } if s.sandboxDomain != "" { resp["sandboxDomain"] = s.sandboxDomain @@ -1210,6 +1213,20 @@ func isSpotSandboxSession(session *db.SandboxSession) bool { return sandboxSessionFamily(session) == types.SandboxFamilySpot } +func withResumableSandboxEnv(cfg types.SandboxConfig) types.SandboxConfig { + if !cfg.IsResumable() { + return cfg + } + envs := make(map[string]string, len(cfg.Envs)+2) + for k, v := range cfg.Envs { + envs[k] = v + } + envs["OPENSANDBOX_RESUMABLE"] = "true" + envs["OPENSANDBOX_RESUME_NOTICE_SECONDS"] = "25" + cfg.Envs = envs + return cfg +} + func (s *Server) setLimits(c echo.Context) error { id := c.Param("id") ctx := c.Request().Context() @@ -1305,12 +1322,6 @@ func (s *Server) scaleSandbox(c echo.Context) error { // resources. Same code that the autoscale endpoint and the autoscaler // loop use, so SDK consumers can branch on a single error code. 
if s.store != nil { - if session, err := s.store.GetSandboxSession(c.Request().Context(), id); err == nil && isSpotSandboxSession(session) { - return c.JSON(http.StatusForbidden, map[string]any{ - "error": "spot sandboxes are fixed at 1 vCPU and 1024 MB in alpha", - "code": "sandbox_family_scale_disabled", - }) - } if locked, err := s.store.GetScalingLock(c.Request().Context(), id); err == nil && locked { return c.JSON(http.StatusForbidden, map[string]any{ "error": "scaling is locked on this sandbox — unlock via PUT /scaling-lock to allow size changes", @@ -1370,12 +1381,6 @@ func (s *Server) setLimitsRemote(c echo.Context, sandboxID string, maxPids int32 if err != nil { return c.JSON(http.StatusNotFound, map[string]string{"error": "sandbox not found"}) } - if isSpotSandboxSession(session) { - return c.JSON(http.StatusForbidden, map[string]any{ - "error": "spot sandboxes are fixed at 1 vCPU and 1024 MB in alpha", - "code": "sandbox_family_scale_disabled", - }) - } if session.Status != "running" { return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandbox is not running"}) } diff --git a/internal/api/sandbox_autoscale.go b/internal/api/sandbox_autoscale.go index 1d86802e..038434e9 100644 --- a/internal/api/sandbox_autoscale.go +++ b/internal/api/sandbox_autoscale.go @@ -45,12 +45,6 @@ func (s *Server) setAutoscale(c echo.Context) error { } if req.Enabled { - if session, err := s.store.GetSandboxSession(c.Request().Context(), sandboxID); err == nil && isSpotSandboxSession(session) { - return c.JSON(http.StatusForbidden, map[string]any{ - "error": "spot sandboxes are fixed at 1 vCPU and 1024 MB in alpha", - "code": "sandbox_family_scale_disabled", - }) - } // Refuse if the sandbox is scaling-locked. 
The lock auto-disables // autoscale on toggle; refusing here prevents a user from // re-enabling it while the lock is still on (which would be a @@ -157,9 +151,6 @@ func (a *AutoscalerSetter) SetSandboxMemoryMB(ctx context.Context, sandboxID str // shouldn't have picked them, but better to no-op cleanly than fail. return nil } - if isSpotSandboxSession(session) { - return fmt.Errorf("sandbox family spot is fixed at 1024 MB; refusing autoscale to %d", memoryMB) - } // Plan cap: the autoscaler runs in-process with no cap-token to read plan // from, so it asks the edge (D1 authority) before growing past the diff --git a/internal/compute/ec2.go b/internal/compute/ec2.go index 4d3673a4..52810b85 100644 --- a/internal/compute/ec2.go +++ b/internal/compute/ec2.go @@ -50,6 +50,16 @@ func wrapEC2CreateErr(err error, format string, args ...any) error { return wrapped } +func supportsEC2NestedVirtualization(instanceType string) bool { + family, _, _ := strings.Cut(strings.ToLower(instanceType), ".") + switch family { + case "c8i", "m8i", "r8i": + return true + default: + return false + } +} + const ( // AWS tag keys (kept consistent with the Azure pool's azure-prefixed tags). awsTagRole = "opensandbox:role" @@ -60,14 +70,14 @@ const ( // EC2PoolConfig configures the EC2 compute pool. type EC2PoolConfig struct { - Region string - AccessKeyID string // empty = use default credential chain (IAM role preferred) - SecretAccessKey string - AMI string // static AMI ID; empty if SSMParameterName is set - InstanceType string // e.g. "c7gd.metal", "r7gd.xlarge", "m7i.large" - SubnetID string - SecurityGroupID string - KeyName string // optional SSH key pair (debug use only) + Region string + AccessKeyID string // empty = use default credential chain (IAM role preferred) + SecretAccessKey string + AMI string // static AMI ID; empty if SSMParameterName is set + InstanceType string // e.g. 
"c7gd.metal", "r7gd.xlarge", "m7i.large" + SubnetID string + SecurityGroupID string + KeyName string // optional SSH key pair (debug use only) IAMInstanceProfile string // attached to instances; gives them Secrets Manager + S3 read SecretsARN string // Secrets Manager ARN; passed to worker via WorkerSpec.SecretsRef SSMParameterName string // SSM parameter for dynamic AMI ID (e.g. /opensandbox/dev/worker-ami-id) @@ -177,6 +187,12 @@ func (p *EC2Pool) CreateMachine(ctx context.Context, opts MachineOpts) (*Machine }, } + if supportsEC2NestedVirtualization(instanceType) { + input.CpuOptions = &ec2types.CpuOptionsRequest{ + NestedVirtualization: ec2types.NestedVirtualizationSpecificationEnabled, + } + } + if p.cfg.SubnetID != "" { input.SubnetId = aws.String(p.cfg.SubnetID) } @@ -485,4 +501,3 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { return sb.String() } - diff --git a/internal/compute/quota_test.go b/internal/compute/quota_test.go index 422bd84c..f01c8ac8 100644 --- a/internal/compute/quota_test.go +++ b/internal/compute/quota_test.go @@ -60,6 +60,30 @@ func TestIsEC2QuotaErr(t *testing.T) { } } +func TestSupportsEC2NestedVirtualization(t *testing.T) { + tests := []struct { + instanceType string + want bool + }{ + {"r8i.48xlarge", true}, + {"m8i.4xlarge", true}, + {"c8i.2xlarge", true}, + {"R8I.48XLARGE", true}, + {"r5d.metal", false}, + {"c7gd.metal", false}, + {"m7i.large", false}, + {"", false}, + } + + for _, tc := range tests { + t.Run(tc.instanceType, func(t *testing.T) { + if got := supportsEC2NestedVirtualization(tc.instanceType); got != tc.want { + t.Fatalf("supportsEC2NestedVirtualization(%q) = %v, want %v", tc.instanceType, got, tc.want) + } + }) + } +} + func TestWrapAzureCreateErrTagsQuota(t *testing.T) { quotaSrc := errors.New("AllocationFailed: no capacity") wrapped := wrapAzureCreateErr(quotaSrc, "azure: create VM foo failed: %w", quotaSrc) diff --git a/internal/controlplane/reconcile.go b/internal/controlplane/reconcile.go index 
c6d265ad..aae73f3d 100644 --- a/internal/controlplane/reconcile.go +++ b/internal/controlplane/reconcile.go @@ -3,6 +3,7 @@ package controlplane import ( "context" "encoding/json" + "fmt" "log" "strings" "time" @@ -11,6 +12,7 @@ import ( "github.com/redis/go-redis/v9" "github.com/opensandbox/opensandbox/internal/db" + "github.com/opensandbox/opensandbox/pkg/types" pb "github.com/opensandbox/opensandbox/proto/worker" ) @@ -18,15 +20,16 @@ import ( // believes is stopped on this worker but the worker may still be hosting. // // Why this exists: -// When the worker is unreachable, the customer's DELETE goes through the -// cell-side fallback at internal/api/sandbox.go's destroy handler — the -// cell marks the session stopped in PG and publishes a `stopped` lifecycle -// event "to close the drift" with D1. Worker never receives the gRPC -// Destroy. When the worker becomes reachable again, the cell already -// thinks the sandbox is dead, but the worker still has m.vms[id] alive, -// qemu still running, usage_ticker still emitting `usage_tick` events. -// That window has run for 74h+ in the wild before a worker restart finally -// cleared the m.vms map. +// +// When the worker is unreachable, the customer's DELETE goes through the +// cell-side fallback at internal/api/sandbox.go's destroy handler — the +// cell marks the session stopped in PG and publishes a `stopped` lifecycle +// event "to close the drift" with D1. Worker never receives the gRPC +// Destroy. When the worker becomes reachable again, the cell already +// thinks the sandbox is dead, but the worker still has m.vms[id] alive, +// qemu still running, usage_ticker still emitting `usage_tick` events. +// That window has run for 74h+ in the wild before a worker restart finally +// cleared the m.vms map. 
// // This reconcile closes that gap: on worker rejoin (RedisWorkerRegistry's // OnWorkerRejoined callback), the cell sweeps every sandbox it has marked @@ -102,30 +105,32 @@ func isSandboxNotFound(err error) bool { // ReconcileRunningOnWorker is the symmetric direction of ReconcileStoppedOnWorker: // -// forward — cell says STOPPED on this worker, worker may still be hosting → -// re-issue Destroy via RPC. ReconcileStoppedOnWorker above. -// reverse — cell says RUNNING on this worker, worker doesn't have it → -// close the row on the cell side. THIS function. +// forward — cell says STOPPED on this worker, worker may still be hosting → +// re-issue Destroy via RPC. ReconcileStoppedOnWorker above. +// reverse — cell says RUNNING on this worker, worker doesn't have it → +// close the row on the cell side. THIS function. // // Why both directions are needed: -// The cell-side fallback at internal/api/sandbox.go (worker-unreachable -// destroy path) covers the forward direction. There's no symmetric fallback -// for "worker died, never finished EndScaleEvent on its way out" — when a -// worker process crashes/OOMs/restarts, its m.vms is cleared but cell PG -// keeps the scale event open. usage-reporter sums (now - started_at) -// indefinitely; customer gets billed for compute that hasn't run for days. +// +// The cell-side fallback at internal/api/sandbox.go (worker-unreachable +// destroy path) covers the forward direction. There's no symmetric fallback +// for "worker died, never finished EndScaleEvent on its way out" — when a +// worker process crashes/OOMs/restarts, its m.vms is cleared but cell PG +// keeps the scale event open. usage-reporter sums (now - started_at) +// indefinitely; customer gets billed for compute that hasn't run for days. // // Empirical fingerprint from prod: 49 still-open scale events on two workers // that were known to have restarted ~3-5 days prior, accumulating ~$2k of // phantom Pro billing per restart event. // // Process: -// 1. 
Ask cell PG: what sandboxes are status='running' on this worker? -// 2. Ask worker (existing ListSandboxes RPC): what do you actually have? -// 3. For each cell-PG-running entry the worker doesn't claim: -// - UpdateSandboxSessionStatus → stopped -// - EndScaleEvent → closes the open billing row -// - publish stopped lifecycle event so events-ingest updates D1 +// 1. Ask cell PG: what sandboxes are status='running' on this worker? +// 2. Ask worker (existing ListSandboxes RPC): what do you actually have? +// 3. For each cell-PG-running entry the worker doesn't claim: +// - resumable: recreate from the shared cell disk on an eligible worker +// and move routing to that worker +// - non-resumable: UpdateSandboxSessionStatus → stopped, EndScaleEvent, +// publish stopped lifecycle event so events-ingest updates D1 func ReconcileRunningOnWorker(ctx context.Context, registry *RedisWorkerRegistry, store *db.Store, cellID, workerID string) { if store == nil || registry == nil { return @@ -162,12 +167,18 @@ func ReconcileRunningOnWorker(ctx context.Context, registry *RedisWorkerRegistry log.Printf("controlplane: reverse-reconcile %s: cell-running=%d worker-has=%d", workerID, len(cellRunning), len(workerHas)) reason := "reverse_reconcile_worker_lost_session" - var closed, alive int + var closed, alive, recreated int for _, ref := range cellRunning { if _, has := workerHas[ref.SandboxID]; has { alive++ continue } + if ok, err := recreateResumableSandbox(ctx, registry, store, cellID, ref.SandboxID, workerID); err != nil { + log.Printf("controlplane: reverse-reconcile %s: resumable recreate %s failed: %v", workerID, ref.SandboxID, err) + } else if ok { + recreated++ + continue + } // Close the cell-side state. 
Order matters: status first (so future // dashboard reads see the right thing immediately), then scale event // (so usage-reporter stops billing), then the lifecycle event (so D1 @@ -187,7 +198,81 @@ func ReconcileRunningOnWorker(ctx context.Context, registry *RedisWorkerRegistry publishStoppedLifecycleEvent(ctx, registry.RedisClient(), cellID, ref.SandboxID, ref.OrgID.String(), workerID, reason) closed++ } - log.Printf("controlplane: reverse-reconcile %s: closed=%d still-alive-on-worker=%d (of %d cell-running)", workerID, closed, alive, len(cellRunning)) + log.Printf("controlplane: reverse-reconcile %s: recreated=%d closed=%d still-alive-on-worker=%d (of %d cell-running)", workerID, recreated, closed, alive, len(cellRunning)) +} + +func recreateResumableSandbox(ctx context.Context, registry *RedisWorkerRegistry, store *db.Store, cellID, sandboxID, oldWorkerID string) (bool, error) { + session, err := store.GetSandboxSession(ctx, sandboxID) + if err != nil { + return false, err + } + if session.Status != "running" { + return false, nil + } + var cfg types.SandboxConfig + if len(session.Config) > 0 { + if err := json.Unmarshal(session.Config, &cfg); err != nil { + return false, fmt.Errorf("parse sandbox config: %w", err) + } + } + if !cfg.IsResumable() { + return false, nil + } + cfg.EnsureNetworkEnabledDefault() + cfg.SandboxID = sandboxID + if cfg.Envs == nil { + cfg.Envs = map[string]string{} + } + cfg.Envs["OPENSANDBOX_RESUMABLE"] = "true" + cfg.Envs["OPENSANDBOX_RESUME_NOTICE_SECONDS"] = "25" + + worker, client, err := registry.GetLeastLoadedWorker(session.Region) + if err != nil { + return true, fmt.Errorf("pick worker: %w", err) + } + + rpcCtx, cancel := context.WithTimeout(ctx, 60*time.Second) + resp, err := client.CreateSandbox(rpcCtx, &pb.CreateSandboxRequest{ + SandboxId: sandboxID, + Template: cfg.Template, + Timeout: int32(cfg.Timeout), + Envs: cfg.Envs, + MemoryMb: int32(cfg.MemoryMB), + CpuCount: int32(cfg.CpuCount), + NetworkEnabled: 
cfg.IsNetworkEnabled(), + Port: int32(cfg.Port), + EgressAllowlist: cfg.EgressAllowlist, + SecretAllowedHosts: flattenSecretAllowedHostsForReconcile(cfg.SecretAllowedHosts), + SecretEnvs: cfg.SecretEnvs, + DiskMb: int32(cfg.DiskMB), + }) + cancel() + if err != nil { + return true, fmt.Errorf("worker CreateSandbox on %s: %w", worker.ID, err) + } + if resp == nil || resp.SandboxId == "" { + return true, fmt.Errorf("worker CreateSandbox on %s returned empty response", worker.ID) + } + if err := store.CompleteMigration(ctx, sandboxID, worker.ID); err != nil { + return true, fmt.Errorf("update session worker: %w", err) + } + if worker.GoldenVersion != "" { + _ = store.SetSandboxGoldenVersion(ctx, sandboxID, worker.GoldenVersion) + } + PublishLifecycle(ctx, registry.RedisClient(), cellID, "migrated", sandboxID, worker.ID, session.OrgID, "resumable_recreate") + log.Printf("controlplane: resumable recreate %s: %s -> %s", sandboxID, oldWorkerID, worker.ID) + return true, nil +} + +func flattenSecretAllowedHostsForReconcile(m map[string][]string) map[string]string { + if len(m) == 0 { + return nil + } + out := make(map[string]string, len(m)) + for k, hosts := range m { + out[k] = strings.Join(hosts, ",") + } + return out } // publishStoppedLifecycleEvent emits a `stopped` event onto this cell's events diff --git a/internal/controlplane/redis_registry.go b/internal/controlplane/redis_registry.go index 30da6b4b..6f9c5f82 100644 --- a/internal/controlplane/redis_registry.go +++ b/internal/controlplane/redis_registry.go @@ -338,6 +338,7 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { existing.TotalMemoryMB = entry.TotalMemoryMB existing.CommittedMemoryMB = entry.CommittedMemoryMB existing.Draining = drainOverride + existing.Sandboxes = entry.Sandboxes if entry.GoldenVersion != "" { existing.GoldenVersion = entry.GoldenVersion } diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index 8c150fd6..a35dd8d4 100644 --- 
a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -359,11 +359,46 @@ func (s *Scaler) evaluate() { regions = poolRegions } } + s.recreateResumableOnDeadWorkers(ctx) for _, region := range regions { s.evaluateRegion(ctx, region) } } +func (s *Scaler) recreateResumableOnDeadWorkers(ctx context.Context) { + if s.store == nil { + return + } + redisRegistry, ok := s.registry.(*RedisWorkerRegistry) + if !ok { + return + } + liveWorkers := make(map[string]bool) + for _, region := range s.registry.Regions() { + for _, w := range s.registry.GetWorkersByRegion(region) { + liveWorkers[w.ID] = true + } + } + if len(liveWorkers) == 0 { + return + } + deadRefs, err := s.store.ListRunningSandboxesOnDeadWorkers(ctx, liveWorkers) + if err != nil { + log.Printf("scaler: resumable dead-worker scan failed: %v", err) + return + } + for _, ref := range deadRefs { + ok, err := recreateResumableSandbox(ctx, redisRegistry, s.store, s.cellID, ref.SandboxID, ref.WorkerID) + if err != nil { + log.Printf("scaler: resumable recreate %s from dead worker %s failed: %v", ref.SandboxID, ref.WorkerID, err) + continue + } + if ok { + log.Printf("scaler: resumable recreate %s from dead worker %s succeeded", ref.SandboxID, ref.WorkerID) + } + } +} + func (s *Scaler) evaluateRegion(ctx context.Context, region string) { s.mu.Lock() defer s.mu.Unlock() diff --git a/internal/db/store.go b/internal/db/store.go index 2d5de90a..09e904fa 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -988,6 +988,7 @@ func (s *Store) ListSandboxSessions(ctx context.Context, orgID uuid.UUID, status type WorkerSandboxRef struct { SandboxID string OrgID uuid.UUID + WorkerID string } // ListSandboxesByWorkerStatus returns the (sandbox_id, org_id) pairs for sessions @@ -1017,6 +1018,33 @@ func (s *Store) ListSandboxesByWorkerStatus(ctx context.Context, workerID, statu return refs, rows.Err() } +// ListRunningSandboxesOnDeadWorkers returns running sessions whose worker_id is +// no longer 
present in the live worker set. It is read-only; callers decide +// whether to recover or close each row. +func (s *Store) ListRunningSandboxesOnDeadWorkers(ctx context.Context, liveWorkers map[string]bool) ([]WorkerSandboxRef, error) { + rows, err := s.pool.Query(ctx, + `SELECT sandbox_id, org_id, worker_id FROM sandbox_sessions + WHERE status = 'running' + ORDER BY started_at DESC + LIMIT 1000`) + if err != nil { + return nil, fmt.Errorf("list running sandboxes: %w", err) + } + defer rows.Close() + var refs []WorkerSandboxRef + for rows.Next() { + var r WorkerSandboxRef + if err := rows.Scan(&r.SandboxID, &r.OrgID, &r.WorkerID); err != nil { + return nil, err + } + if liveWorkers[r.WorkerID] { + continue + } + refs = append(refs, r) + } + return refs, rows.Err() +} + // ListSandboxIDsByWorkerStatus returns sandbox IDs (the public sb-... form, // not the row UUID) for sessions on a specific worker with a specific status. // Used by the reconcile-on-reconnect sweep: when a worker rejoins after being diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index 957f28fa..19ded6e5 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -1658,8 +1658,13 @@ func (m *Manager) Create(ctx context.Context, cfg types.SandboxConfig) (sb *type id = "sb-" + uuid.New().String()[:8] } + sandboxDir := filepath.Join(m.cfg.DataDir, "sandboxes", id) + rootfsPath := filepath.Join(sandboxDir, "rootfs.qcow2") + workspacePath := filepath.Join(sandboxDir, "workspace.qcow2") + reuseExistingDrives := cfg.IsResumable() && fileExists(rootfsPath) && fileExists(workspacePath) + // Fast path: restore from golden snapshot if available and using default template - if m.goldenDir != "" && template == "default" && cfg.TemplateRootfsKey == "" { + if !reuseExistingDrives && m.goldenDir != "" && template == "default" && cfg.TemplateRootfsKey == "" { sb, err := m.createFromGolden(ctx, cfg, id) if err != nil { log.Printf("qemu: golden restore failed for %s, falling back to cold 
boot: %v", id, err) @@ -1669,15 +1674,13 @@ func (m *Manager) Create(ctx context.Context, cfg types.SandboxConfig) (sb *type } } - sandboxDir := filepath.Join(m.cfg.DataDir, "sandboxes", id) if err := os.MkdirAll(sandboxDir, 0755); err != nil { return nil, fmt.Errorf("mkdir sandbox dir: %w", err) } - rootfsPath := filepath.Join(sandboxDir, "rootfs.qcow2") - workspacePath := filepath.Join(sandboxDir, "workspace.qcow2") - - if cfg.TemplateRootfsKey != "" { + if reuseExistingDrives { + log.Printf("qemu: create %s: reusing existing resumable drives", id) + } else if cfg.TemplateRootfsKey != "" { srcRootfs := strings.TrimPrefix(cfg.TemplateRootfsKey, "local://") srcWorkspace := strings.TrimPrefix(cfg.TemplateWorkspaceKey, "local://") log.Printf("qemu: create %s from snapshot template (rootfs=%s, workspace=%s)", id, srcRootfs, srcWorkspace) diff --git a/internal/qemu/migration.go b/internal/qemu/migration.go index afb74313..3dfc597f 100644 --- a/internal/qemu/migration.go +++ b/internal/qemu/migration.go @@ -627,6 +627,11 @@ func (m *Manager) CompleteIncomingMigration(ctx context.Context, sandboxID strin // type so the worker gRPC layer can refer to it without an import cycle. func (m *Manager) PreCopyDrives(ctx context.Context, sandboxID string, checkpointStore *storage.CheckpointStore) (rootfsKey, workspaceKey, goldenVer string, baseCPU, baseMem, actualMem int, secrets sandbox.MigrationSecrets, err error) { + if checkpointStore == nil { + err = fmt.Errorf("checkpoint store not configured") + return + } + m.mu.RLock() vm, exists := m.vms[sandboxID] var pid int @@ -813,6 +818,10 @@ func (m *Manager) LiveMigrate(ctx context.Context, sandboxID, incomingAddr strin // uploadFile uploads a local file to S3. 
func (mc *MigrationCoordinator) uploadFile(ctx context.Context, localPath, s3Key string) (int64, error) { + if mc.checkpointStore == nil { + return 0, fmt.Errorf("checkpoint store not configured") + } + info, err := os.Stat(localPath) if err != nil { return 0, err diff --git a/internal/worker/grpc_server.go b/internal/worker/grpc_server.go index 87bbfeaf..4a7a7b0e 100644 --- a/internal/worker/grpc_server.go +++ b/internal/worker/grpc_server.go @@ -220,6 +220,10 @@ func (s *GRPCServer) CreateSandbox(ctx context.Context, req *pb.CreateSandboxReq SecretEnvs: req.SecretEnvs, DiskMB: int(req.DiskMb), } + if cfg.Envs["OPENSANDBOX_RESUMABLE"] == "true" { + cfg.Resumable = true + cfg.SandboxFamily = types.SandboxFamilySpot + } // Warm fork: if checkpoint_id is set, fork from the local checkpoint cache. // ForkFromCheckpoint uses the local cache directly — no S3 needed. diff --git a/pkg/types/sandbox.go b/pkg/types/sandbox.go index 39932462..8988ecb9 100644 --- a/pkg/types/sandbox.go +++ b/pkg/types/sandbox.go @@ -34,6 +34,7 @@ type Sandbox struct { CpuCount int `json:"cpuCount"` MemoryMB int `json:"memoryMB"` SandboxFamily string `json:"sandboxFamily,omitempty"` + Resumable bool `json:"resumable,omitempty"` MachineID string `json:"machineID,omitempty"` // ConnectURL and Token are currently unused by SDKs. All data-plane traffic // flows through the control plane's SandboxAPIProxy, which proxies to workers @@ -52,12 +53,16 @@ type SandboxConfig struct { CpuCount int `json:"cpuCount,omitempty"` // default 1 MemoryMB int `json:"memoryMB,omitempty"` // default 256 DiskMB int `json:"diskMB,omitempty"` // workspace disk in MB (default 20480) - // SandboxFamily selects an alpha placement/resource family. Empty is the - // default on-demand family. "spot" routes through spot-only capacity and is - // currently restricted to 1 vCPU / 1024 MB with scaling disabled. 
- SandboxFamily string `json:"sandboxFamily,omitempty"` - Envs map[string]string `json:"envs,omitempty"` - Port int `json:"port,omitempty"` // container port to expose via subdomain (default 80) + // SandboxFamily selects an internal placement family. Empty is the default + // on-demand family. "spot" routes through resumable spare-capacity workers. + SandboxFamily string `json:"sandboxFamily,omitempty"` + // Resumable selects the alpha lower-cost resumable sandbox tier. Resumable + // sandboxes preserve disk across infrastructure restarts but do not + // guarantee running process or memory survival. Internally this maps to the + // spot placement family while that tier is backed by spare cloud capacity. + Resumable bool `json:"resumable,omitempty"` + Envs map[string]string `json:"envs,omitempty"` + Port int `json:"port,omitempty"` // container port to expose via subdomain (default 80) // NetworkEnabled is a pointer so we can distinguish "unset" from // "explicitly false". Unset defaults to true (see IsNetworkEnabled). NetworkEnabled *bool `json:"networkEnabled,omitempty"` @@ -201,25 +206,23 @@ func ValidateResourceTier(cfg *SandboxConfig) error { // ApplySandboxFamilyDefaultsAndValidate normalizes alpha sandbox-family options // before regular resource-tier validation. 
func ApplySandboxFamilyDefaultsAndValidate(cfg *SandboxConfig) error { + if cfg.Resumable { + if cfg.SandboxFamily == "" || cfg.SandboxFamily == "default" { + cfg.SandboxFamily = SandboxFamilySpot + } + } switch cfg.SandboxFamily { case SandboxFamilyDefault: return nil case "default": cfg.SandboxFamily = SandboxFamilyDefault return nil + case "resumable": + cfg.SandboxFamily = SandboxFamilySpot + cfg.Resumable = true + return ApplySandboxFamilyDefaultsAndValidate(cfg) case SandboxFamilySpot: - if len(cfg.ImageManifest) > 0 || cfg.Snapshot != "" { - return fmt.Errorf("sandboxFamily %q does not support image or snapshot creates in alpha", SandboxFamilySpot) - } - if cfg.MemoryMB == 0 { - cfg.MemoryMB = 1024 - } - if cfg.CpuCount == 0 { - cfg.CpuCount = 1 - } - if cfg.MemoryMB != 1024 || cfg.CpuCount != 1 { - return fmt.Errorf("sandboxFamily %q is currently limited to 1 vCPU and 1024 MB memory", SandboxFamilySpot) - } + cfg.Resumable = true return nil default: return fmt.Errorf("unsupported sandboxFamily %q", cfg.SandboxFamily) @@ -230,6 +233,10 @@ func (c SandboxConfig) IsSpotFamily() bool { return c.SandboxFamily == SandboxFamilySpot } +func (c SandboxConfig) IsResumable() bool { + return c.Resumable || c.SandboxFamily == SandboxFamilySpot +} + // SandboxListResponse is the response for listing sandboxes. 
type SandboxListResponse struct { Sandboxes []Sandbox `json:"sandboxes"` diff --git a/pkg/types/sandbox_family_test.go b/pkg/types/sandbox_family_test.go index 41834b92..64103f03 100644 --- a/pkg/types/sandbox_family_test.go +++ b/pkg/types/sandbox_family_test.go @@ -1,45 +1,54 @@ package types import ( - "encoding/json" "strings" "testing" ) -func TestApplySandboxFamilyDefaultsAndValidateSpotDefaultsToSmallestTier(t *testing.T) { +func TestApplySandboxFamilyDefaultsAndValidateSpotMarksResumable(t *testing.T) { cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot} if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { t.Fatalf("unexpected error: %v", err) } - if cfg.CpuCount != 1 || cfg.MemoryMB != 1024 { - t.Fatalf("expected spot defaults 1 cpu / 1024MB, got cpu=%d memory=%d", cfg.CpuCount, cfg.MemoryMB) + if !cfg.Resumable { + t.Fatalf("expected internal spot family to mark sandbox resumable") } - if err := ValidateResourceTier(&cfg); err != nil { - t.Fatalf("spot defaults should be a valid tier: %v", err) +} + +func TestApplySandboxFamilyDefaultsAndValidateResumableFlagMapsToSpot(t *testing.T) { + cfg := SandboxConfig{Resumable: true} + + if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg.SandboxFamily != SandboxFamilySpot { + t.Fatalf("expected resumable to map to internal spot family, got %q", cfg.SandboxFamily) + } + if cfg.CpuCount != 0 || cfg.MemoryMB != 0 { + t.Fatalf("expected resumable not to force resources, got cpu=%d memory=%d", cfg.CpuCount, cfg.MemoryMB) } } -func TestApplySandboxFamilyDefaultsAndValidateSpotRejectsLargerTier(t *testing.T) { - cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot, CpuCount: 1, MemoryMB: 4096} +func TestApplySandboxFamilyDefaultsAndValidateResumableFamilyAlias(t *testing.T) { + cfg := SandboxConfig{SandboxFamily: "resumable"} - err := ApplySandboxFamilyDefaultsAndValidate(&cfg) - if err == nil || !strings.Contains(err.Error(), 
"limited to 1 vCPU and 1024 MB") { - t.Fatalf("expected spot size rejection, got %v", err) + if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg.SandboxFamily != SandboxFamilySpot || !cfg.Resumable { + t.Fatalf("expected resumable alias to map to internal spot family, got family=%q resumable=%v", cfg.SandboxFamily, cfg.Resumable) } } -func TestApplySandboxFamilyDefaultsAndValidateSpotRejectsSnapshotAndImage(t *testing.T) { - for name, cfg := range map[string]SandboxConfig{ - "snapshot": {SandboxFamily: SandboxFamilySpot, Snapshot: "snap"}, - "image": {SandboxFamily: SandboxFamilySpot, ImageManifest: json.RawMessage(`{"steps":[]}`)}, - } { - t.Run(name, func(t *testing.T) { - err := ApplySandboxFamilyDefaultsAndValidate(&cfg) - if err == nil || !strings.Contains(err.Error(), "does not support image or snapshot") { - t.Fatalf("expected image/snapshot rejection, got %v", err) - } - }) +func TestApplySandboxFamilyDefaultsAndValidateResumableAllowsLargerTier(t *testing.T) { + cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot, CpuCount: 1, MemoryMB: 4096} + + if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if err := ValidateResourceTier(&cfg); err != nil { + t.Fatalf("expected larger resumable tier to validate normally: %v", err) } } diff --git a/sdks/python/opencomputer/sandbox.py b/sdks/python/opencomputer/sandbox.py index 077044d3..4f7f35de 100644 --- a/sdks/python/opencomputer/sandbox.py +++ b/sdks/python/opencomputer/sandbox.py @@ -34,9 +34,8 @@ class PlanLimitError(Exception): class SandboxFamilyLimitError(Exception): - """Raised when a resource-changing call is blocked by the sandbox's - family. Alpha spot sandboxes are fixed at 1 vCPU / 1024 MB and cannot be - scaled. + """Raised when a resource-changing call is blocked by a sandbox family. + Kept for compatibility with older API responses. 
""" code = "sandbox_family_scale_disabled" @@ -86,6 +85,7 @@ async def create( api_url: str | None = None, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, + resumable: bool | None = None, sandbox_family: str | None = None, disk_mb: int | None = None, memory_mb: int | None = None, @@ -103,8 +103,10 @@ async def create( api_url: API URL (or OPENCOMPUTER_API_URL env var). envs: Environment variables to inject. Overrides store secrets. metadata: Custom metadata key-value pairs. - sandbox_family: Alpha placement/resource family. ``"spot"`` uses - spot-only capacity and is limited to 1 vCPU / 1024 MB. + resumable: Create a resumable sandbox. Disk is preserved across + infrastructure restarts; processes may restart. + sandbox_family: Internal/legacy placement family. Prefer + ``resumable=True`` for public API usage. disk_mb: Workspace disk size in MB (default 20480 = 20GB). Any additional GB above 20GB is metered at a per-second rate comparable to EBS gp3. Closed beta: requests above 20GB @@ -148,6 +150,8 @@ async def create( body["envs"] = envs if metadata: body["metadata"] = metadata + if resumable is not None: + body["resumable"] = resumable if sandbox_family: body["sandboxFamily"] = sandbox_family if disk_mb is not None: diff --git a/sdks/typescript/src/sandbox.ts b/sdks/typescript/src/sandbox.ts index 8919bdfd..c627fb7b 100644 --- a/sdks/typescript/src/sandbox.ts +++ b/sdks/typescript/src/sandbox.ts @@ -13,7 +13,9 @@ function resolveApiUrl(url: string): string { export interface SandboxOpts { template?: string; - /** Alpha placement/resource family. "spot" uses spot-only capacity and is limited to 1 vCPU / 1024 MB. */ + /** Create a resumable sandbox. Disk is preserved across infrastructure restarts; processes may restart. */ + resumable?: boolean; + /** Internal/legacy placement family. Prefer `resumable: true` for public API usage. */ sandboxFamily?: "spot"; /** * Idle timeout in seconds after which the sandbox auto-hibernates. 
@@ -49,6 +51,7 @@ interface SandboxData { status: string; templateID?: string; sandboxFamily?: string; + resumable?: boolean; connectURL?: string; token?: string; sandboxDomain?: string; @@ -122,7 +125,7 @@ export class PlanLimitError extends Error { /** * Thrown when a resource-changing call is blocked by the sandbox's family. - * Alpha spot sandboxes are fixed at 1 vCPU / 1024 MB and cannot be scaled. + * Kept for compatibility with older API responses. */ export class SandboxFamilyLimitError extends Error { readonly code = "sandbox_family_scale_disabled"; @@ -263,6 +266,7 @@ export class Sandbox { }; if (opts.envs) body.envs = opts.envs; if (opts.metadata) body.metadata = opts.metadata; + if (opts.resumable != null) body.resumable = opts.resumable; if (opts.sandboxFamily) body.sandboxFamily = opts.sandboxFamily; if (opts.cpuCount != null) body.cpuCount = opts.cpuCount; if (opts.memoryMB != null) body.memoryMB = opts.memoryMB; From e2f4854984d4791372e87949a132434424a43e44 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 3 Jun 2026 22:10:41 -0700 Subject: [PATCH 16/32] Update resumable sandbox reliability docs --- docs/sandboxes/resumable-sandboxes.mdx | 129 ++++++++++++++++++++----- 1 file changed, 107 insertions(+), 22 deletions(-) diff --git a/docs/sandboxes/resumable-sandboxes.mdx b/docs/sandboxes/resumable-sandboxes.mdx index 21088307..bd6f03cf 100644 --- a/docs/sandboxes/resumable-sandboxes.mdx +++ b/docs/sandboxes/resumable-sandboxes.mdx @@ -108,35 +108,120 @@ During alpha, resumable sandboxes have these restrictions: ## Reliability Pattern -For agent or batch systems, treat a resumable sandbox as durable-disk, restartable compute: +For agent or batch systems, run your work from a small process wrapper and install a restart-notice hook. 1. Store task state on disk or outside the sandbox. 2. Write outputs to durable storage as the task progresses. -3. Keep long commands idempotent so restarts are safe. -4. 
On disconnect, reconnect and check the sandbox before retrying the task. -5. Use on-demand sandboxes for workloads with no interruption budget. +3. Install `/home/sandbox/.opencomputer/on-restart-notice` to flush or checkpoint before restart. +4. Make your startup command resume from the last durable state. +5. Use on-demand sandboxes for workloads that cannot tolerate a process restart. + +Example setup: + + ```typescript TypeScript -async function runRetryableTask(command: string) { - for (let attempt = 1; attempt <= 3; attempt++) { - const sandbox = await Sandbox.create({ - resumable: true, - timeout: 300, - metadata: { workload: "retryable-batch", attempt: String(attempt) }, - }); - - try { - const result = await sandbox.exec.run(command, { timeout: 120 }); - return result.stdout; - } finally { - await sandbox.kill().catch(() => {}); - } - } - - throw new Error("task failed after resumable retries"); -} +import { Sandbox } from "@opencomputer/sdk"; + +const sandbox = await Sandbox.create({ + resumable: true, + timeout: 0, +}); + +await sandbox.exec.run("mkdir -p /home/sandbox/.opencomputer /home/sandbox/app"); + +await sandbox.files.write( + "/home/sandbox/.opencomputer/on-restart-notice", + `#!/bin/sh +set -eu +echo "restart notice: \${1:-25}s" >> /home/sandbox/app/events.log +touch /home/sandbox/app/restarting +sync +`, +); + +await sandbox.exec.run("chmod +x /home/sandbox/.opencomputer/on-restart-notice"); + +await sandbox.files.write( + "/home/sandbox/app/worker.sh", + `#!/bin/sh +set -eu +cd /home/sandbox/app +if [ -f restarting ]; then + echo "resumed after restart" >> events.log + rm -f restarting +else + echo "started" >> events.log +fi + +i=$(cat counter 2>/dev/null || echo 0) +while true; do + i=$((i + 1)) + echo "$i" > counter + sync + sleep 5 +done +`, +); + +await sandbox.exec.run("chmod +x /home/sandbox/app/worker.sh"); +await sandbox.exec.background("/home/sandbox/app/worker.sh", { + maxRunAfterDisconnect: 0, +}); ``` +```python Python +from opencomputer
import Sandbox + +sandbox = await Sandbox.create(resumable=True, timeout=0) + +await sandbox.exec.run("mkdir -p /home/sandbox/.opencomputer /home/sandbox/app") + +await sandbox.files.write( + "/home/sandbox/.opencomputer/on-restart-notice", + """#!/bin/sh +set -eu +echo "restart notice: ${1:-25}s" >> /home/sandbox/app/events.log +touch /home/sandbox/app/restarting +sync +""", +) + +await sandbox.exec.run("chmod +x /home/sandbox/.opencomputer/on-restart-notice") + +await sandbox.files.write( + "/home/sandbox/app/worker.sh", + """#!/bin/sh +set -eu +cd /home/sandbox/app +if [ -f restarting ]; then + echo "resumed after restart" >> events.log + rm -f restarting +else + echo "started" >> events.log +fi + +i=$(cat counter 2>/dev/null || echo 0) +while true; do + i=$((i + 1)) + echo "$i" > counter + sync + sleep 5 +done +""", +) + +await sandbox.exec.run("chmod +x /home/sandbox/app/worker.sh") +await sandbox.exec.background( + "/home/sandbox/app/worker.sh", + max_run_after_disconnect=0, +) +``` + + + +When OpenComputer receives a restart notice, it runs the hook and passes the notice window, in seconds, as the hook's first argument. If the sandbox resumes on another worker, start your process again and read the state you wrote under `/home/sandbox/app`.
+ ## Related From 9bf21fda31dcdf930e8f0f1311372fa8244600c3 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Thu, 4 Jun 2026 18:02:16 -0700 Subject: [PATCH 17/32] Configure worker-local golden cache --- cmd/worker/main.go | 1 + internal/config/config.go | 90 ++++++++++++++++++++------------------- internal/qemu/manager.go | 19 +++++++-- 3 files changed, 62 insertions(+), 48 deletions(-) diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 8d947096..ade6d501 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -237,6 +237,7 @@ func main() { DataDir: cfg.DataDir, KernelPath: cfg.KernelPath, ImagesDir: cfg.ImagesDir, + GoldenDir: cfg.GoldenDir, QEMUBin: cfg.QEMUBin, AgentBinaryPath: "/usr/local/bin/osb-agent", AgentVersion: AgentVersion, diff --git a/internal/config/config.go b/internal/config/config.go index b78ba625..b8a7f4a5 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -102,28 +102,29 @@ type Config struct { // QEMU VM configuration (worker mode) KernelPath string // Path to vmlinux kernel ImagesDir string // Path to base rootfs images + GoldenDir string // Path to worker-local golden VM snapshot cache QEMUBin string // Path to qemu-system binary (default: "qemu-system-x86_64") // AWS EC2 compute pool (server mode only — for auto-scaling worker machines) - EC2AMI string // Custom AMI for worker instances - EC2InstanceType string // single fallback type; used only when EC2InstanceTypes is empty - EC2InstanceTypes []string // ranked list of instance types tried in order on quota/capacity errors - EC2SubnetID string // VPC subnet for worker instances - EC2SecurityGroupID string // Security group (allow 8080, 9090, 9091) - EC2KeyName string // SSH key pair name (for debugging) - EC2WorkerImage string // Docker image for containerized workers - EC2IAMInstanceProfile string // IAM instance profile for worker instances (Secrets Manager + S3) - EC2SSMParameterName string // SSM parameter name for dynamic AMI ID (e.g. 
/opensandbox/prod/worker-ami-id) + EC2AMI string // Custom AMI for worker instances + EC2InstanceType string // single fallback type; used only when EC2InstanceTypes is empty + EC2InstanceTypes []string // ranked list of instance types tried in order on quota/capacity errors + EC2SubnetID string // VPC subnet for worker instances + EC2SecurityGroupID string // Security group (allow 8080, 9090, 9091) + EC2KeyName string // SSH key pair name (for debugging) + EC2WorkerImage string // Docker image for containerized workers + EC2IAMInstanceProfile string // IAM instance profile for worker instances (Secrets Manager + S3) + EC2SSMParameterName string // SSM parameter name for dynamic AMI ID (e.g. /opensandbox/prod/worker-ami-id) // Azure compute pool (server mode — for auto-scaling worker VMs) - AzureSubscriptionID string // Azure subscription ID - AzureResourceGroup string // Resource group for worker VMs - AzureVMSize string // single fallback size; used only when AzureVMSizes is empty + AzureSubscriptionID string // Azure subscription ID + AzureResourceGroup string // Resource group for worker VMs + AzureVMSize string // single fallback size; used only when AzureVMSizes is empty AzureVMSizes []string // ranked list of VM sizes tried in order on quota/capacity errors - AzureImageID string // Custom image ID or URN - AzureSubnetID string // Full resource ID of the VNet subnet - AzureSSHPublicKey string // SSH public key for worker VMs - AzureKeyVaultName string // Key Vault name for dynamic image ID refresh (e.g. "opensandbox-prod") + AzureImageID string // Custom image ID or URN + AzureSubnetID string // Full resource ID of the VNet subnet + AzureSSHPublicKey string // SSH public key for worker VMs + AzureKeyVaultName string // Key Vault name for dynamic image ID refresh (e.g. "opensandbox-prod") // AzureWorkerIdentityID is the full resource ID of a UserAssigned managed // identity to attach to every worker VM. 
The identity must already have // "Key Vault Secrets Officer" on the regional KV so workers can fetch @@ -313,7 +314,7 @@ func Load() (*Config, error) { WorkOSCookieDomain: os.Getenv("WORKOS_COOKIE_DOMAIN"), WorkOSFrontendURL: os.Getenv("WORKOS_FRONTEND_URL"), - RedisURL: os.Getenv("OPENSANDBOX_REDIS_URL"), + RedisURL: os.Getenv("OPENSANDBOX_REDIS_URL"), MaxCapacity: envOrDefaultInt("OPENSANDBOX_MAX_CAPACITY", 50), @@ -339,31 +340,32 @@ func Load() (*Config, error) { DefaultSandboxCPUs: envOrDefaultInt("OPENSANDBOX_DEFAULT_SANDBOX_CPUS", 1), DefaultSandboxDiskMB: envOrDefaultInt("OPENSANDBOX_DEFAULT_SANDBOX_DISK_MB", 0), - KernelPath: os.Getenv("OPENSANDBOX_KERNEL_PATH"), // default derived from DataDir - ImagesDir: os.Getenv("OPENSANDBOX_IMAGES_DIR"), // default derived from DataDir - QEMUBin: envOrDefault("OPENSANDBOX_QEMU_BIN", "qemu-system-x86_64"), - - EC2AMI: os.Getenv("OPENSANDBOX_EC2_AMI"), - EC2InstanceType: envOrDefault("OPENSANDBOX_EC2_INSTANCE_TYPE", "c7gd.metal"), - EC2InstanceTypes: splitCSV(os.Getenv("OPENSANDBOX_EC2_INSTANCE_TYPES")), - EC2SubnetID: os.Getenv("OPENSANDBOX_EC2_SUBNET_ID"), - EC2SecurityGroupID: os.Getenv("OPENSANDBOX_EC2_SECURITY_GROUP_ID"), - EC2KeyName: os.Getenv("OPENSANDBOX_EC2_KEY_NAME"), - EC2WorkerImage: envOrDefault("OPENSANDBOX_EC2_WORKER_IMAGE", "opensandbox-worker:latest"), - EC2IAMInstanceProfile: os.Getenv("OPENSANDBOX_EC2_IAM_INSTANCE_PROFILE"), - EC2SSMParameterName: os.Getenv("OPENSANDBOX_EC2_SSM_AMI_PARAM"), - - AzureSubscriptionID: os.Getenv("OPENSANDBOX_AZURE_SUBSCRIPTION_ID"), - AzureResourceGroup: os.Getenv("OPENSANDBOX_AZURE_RESOURCE_GROUP"), - AzureVMSize: envOrDefault("OPENSANDBOX_AZURE_VM_SIZE", "Standard_D16s_v5"), - AzureVMSizes: splitCSV(os.Getenv("OPENSANDBOX_AZURE_VM_SIZES")), - AzureImageID: os.Getenv("OPENSANDBOX_AZURE_IMAGE_ID"), - AzureSubnetID: os.Getenv("OPENSANDBOX_AZURE_SUBNET_ID"), - AzureSSHPublicKey: os.Getenv("OPENSANDBOX_AZURE_SSH_PUBLIC_KEY"), - AzureKeyVaultName: 
os.Getenv("OPENSANDBOX_AZURE_KEY_VAULT_NAME"), - AWSSecretsPrefix: os.Getenv("OPENSANDBOX_AWS_SECRETS_PREFIX"), - Cloud: os.Getenv("OPENSANDBOX_CLOUD"), - CPUOvercommitRatio: envOrDefaultInt("OPENSANDBOX_CPU_OVERCOMMIT_RATIO", 1), + KernelPath: os.Getenv("OPENSANDBOX_KERNEL_PATH"), // default derived from DataDir + ImagesDir: os.Getenv("OPENSANDBOX_IMAGES_DIR"), // default derived from DataDir + GoldenDir: os.Getenv("OPENSANDBOX_GOLDEN_DIR"), // default derived from DataDir + QEMUBin: envOrDefault("OPENSANDBOX_QEMU_BIN", "qemu-system-x86_64"), + + EC2AMI: os.Getenv("OPENSANDBOX_EC2_AMI"), + EC2InstanceType: envOrDefault("OPENSANDBOX_EC2_INSTANCE_TYPE", "c7gd.metal"), + EC2InstanceTypes: splitCSV(os.Getenv("OPENSANDBOX_EC2_INSTANCE_TYPES")), + EC2SubnetID: os.Getenv("OPENSANDBOX_EC2_SUBNET_ID"), + EC2SecurityGroupID: os.Getenv("OPENSANDBOX_EC2_SECURITY_GROUP_ID"), + EC2KeyName: os.Getenv("OPENSANDBOX_EC2_KEY_NAME"), + EC2WorkerImage: envOrDefault("OPENSANDBOX_EC2_WORKER_IMAGE", "opensandbox-worker:latest"), + EC2IAMInstanceProfile: os.Getenv("OPENSANDBOX_EC2_IAM_INSTANCE_PROFILE"), + EC2SSMParameterName: os.Getenv("OPENSANDBOX_EC2_SSM_AMI_PARAM"), + + AzureSubscriptionID: os.Getenv("OPENSANDBOX_AZURE_SUBSCRIPTION_ID"), + AzureResourceGroup: os.Getenv("OPENSANDBOX_AZURE_RESOURCE_GROUP"), + AzureVMSize: envOrDefault("OPENSANDBOX_AZURE_VM_SIZE", "Standard_D16s_v5"), + AzureVMSizes: splitCSV(os.Getenv("OPENSANDBOX_AZURE_VM_SIZES")), + AzureImageID: os.Getenv("OPENSANDBOX_AZURE_IMAGE_ID"), + AzureSubnetID: os.Getenv("OPENSANDBOX_AZURE_SUBNET_ID"), + AzureSSHPublicKey: os.Getenv("OPENSANDBOX_AZURE_SSH_PUBLIC_KEY"), + AzureKeyVaultName: os.Getenv("OPENSANDBOX_AZURE_KEY_VAULT_NAME"), + AWSSecretsPrefix: os.Getenv("OPENSANDBOX_AWS_SECRETS_PREFIX"), + Cloud: os.Getenv("OPENSANDBOX_CLOUD"), + CPUOvercommitRatio: envOrDefaultInt("OPENSANDBOX_CPU_OVERCOMMIT_RATIO", 1), AzureWorkerIdentityID: os.Getenv("OPENSANDBOX_AZURE_WORKER_IDENTITY_ID"), CFAPIToken: 
os.Getenv("OPENSANDBOX_CF_API_TOKEN"), @@ -377,8 +379,8 @@ func Load() (*Config, error) { StripeSecretKey: os.Getenv("STRIPE_SECRET_KEY"), StripeWebhookSecret: os.Getenv("STRIPE_WEBHOOK_SECRET"), StripeTelegramAgentPriceID: os.Getenv("STRIPE_TELEGRAM_AGENT_PRICE_ID"), - StripeSuccessURL: envOrDefault("STRIPE_SUCCESS_URL", "http://localhost:3000/billing?success=true"), - StripeCancelURL: envOrDefault("STRIPE_CANCEL_URL", "http://localhost:3000/billing?cancelled=true"), + StripeSuccessURL: envOrDefault("STRIPE_SUCCESS_URL", "http://localhost:3000/billing?success=true"), + StripeCancelURL: envOrDefault("STRIPE_CANCEL_URL", "http://localhost:3000/billing?cancelled=true"), SegmentWriteKey: os.Getenv("SEGMENT_WRITE_KEY"), diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index 19ded6e5..acdc8d23 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -348,6 +348,7 @@ type Config struct { DataDir string // base data directory (e.g., /data) KernelPath string // path to vmlinux kernel ImagesDir string // path to base rootfs images + GoldenDir string // path to worker-local golden snapshot cache QEMUBin string // path to qemu-system-x86_64 binary AgentBinaryPath string // path to osb-agent binary on host (for hot-upgrade) AgentVersion string // expected agent version (for hot-upgrade check) @@ -425,6 +426,9 @@ func NewManager(cfg Config) (*Manager, error) { if cfg.ImagesDir == "" { cfg.ImagesDir = filepath.Join(cfg.DataDir, "images") } + if cfg.GoldenDir == "" { + cfg.GoldenDir = filepath.Join(cfg.DataDir, "golden") + } if cfg.QEMUBin == "" { cfg.QEMUBin = "qemu-system-x86_64" } @@ -515,6 +519,13 @@ func (m *Manager) GoldenVersion() string { return m.goldenVersion } +func (m *Manager) goldenSnapshotDir() string { + if m.cfg.GoldenDir != "" { + return m.cfg.GoldenDir + } + return filepath.Join(m.cfg.DataDir, "golden") +} + // MemoryAllocatedBytes returns the sum of memory committed to currently-running // sandboxes, in bytes. 
Used by the worker's resource-stats tick to report // oversubscription independent of actual guest workload. @@ -762,7 +773,7 @@ func (m *Manager) PrepareGoldenSnapshot() error { log.Printf("qemu: blobstore golden fetch failed (will try local-only): %v", err) } - goldenDir := filepath.Join(m.cfg.DataDir, "golden") + goldenDir := m.goldenSnapshotDir() memFile := filepath.Join(goldenDir, "mem") rootfsFile := filepath.Join(goldenDir, "rootfs.qcow2") @@ -1052,10 +1063,10 @@ func (m *Manager) PrepareGoldenSnapshot() error { // Returns the old and new golden version strings. func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err error) { oldVersion = m.goldenVersion - goldenDir := filepath.Join(m.cfg.DataDir, "golden") + goldenDir := m.goldenSnapshotDir() // Build new golden in a staging directory - stagingDir := filepath.Join(m.cfg.DataDir, "golden-staging") + stagingDir := goldenDir + "-staging" os.RemoveAll(stagingDir) // clean up any prior failed attempt // Temporarily point goldenDir to staging so PrepareGoldenSnapshot builds there @@ -1063,7 +1074,7 @@ func (m *Manager) RebuildGoldenSnapshot() (oldVersion, newVersion string, err er m.goldenDir = "" // Rename current golden out of the way so PrepareGoldenSnapshot sees no existing snapshot - backupDir := filepath.Join(m.cfg.DataDir, "golden-old") + backupDir := goldenDir + "-old" os.RemoveAll(backupDir) if err := os.Rename(goldenDir, backupDir); err != nil && !os.IsNotExist(err) { m.goldenDir = oldGoldenDir From 31c816cf0df44ba6aa725bcbd97e9abbd601e5f5 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Thu, 4 Jun 2026 18:08:20 -0700 Subject: [PATCH 18/32] Add resumable sandbox recreate flow --- cmd/worker/main.go | 150 +------------------ internal/api/admin_workers.go | 121 ++++++++++++++++ internal/api/router.go | 1 + internal/db/store.go | 12 ++ internal/qemu/reset.go | 215 ++++++++++++++++++++++++++++ internal/worker/handlers.go | 80 +++++++++++ internal/worker/http_server.go | 7 +- 
internal/worker/resumable_notice.go | 163 +++++++++++++++++++++ 8 files changed, 600 insertions(+), 149 deletions(-) create mode 100644 internal/worker/resumable_notice.go diff --git a/cmd/worker/main.go b/cmd/worker/main.go index ade6d501..21ba0237 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -2,7 +2,6 @@ package main import ( "context" - "encoding/json" "fmt" "io" "log" @@ -601,7 +600,7 @@ func main() { } // HTTP server - httpServer := worker.NewHTTPServer(mgr, ptyMgr, execMgr, jwtIssuer, sandboxDBMgr, sbProxy, sbRouter, cfg.SandboxDomain) + httpServer := worker.NewHTTPServer(mgr, ptyMgr, execMgr, jwtIssuer, sandboxDBMgr, sbProxy, sbRouter, cfg.SandboxDomain, store) httpAddr := fmt.Sprintf(":%d", cfg.Port) log.Printf("opensandbox-worker: starting HTTP server on %s", httpAddr) go func() { @@ -714,7 +713,7 @@ func main() { for notice := range notices { log.Printf("opensandbox-worker: PREEMPTION notice from %s - action=%s eta=%s, notifying resumable sandboxes", preemptMon.Name(), notice.Action, notice.ETA.Format(time.RFC3339)) - notifyResumableSandboxesBeforeRestart(context.Background(), mgr, store, sandboxDBMgr, 25*time.Second, notice.ETA) + worker.NotifyResumableSandboxesBeforeRestart(context.Background(), mgr, store, sandboxDBMgr, 25*time.Second, notice.ETA) hb.Stop() return } @@ -925,151 +924,6 @@ func buildCheckpointBackend(label, endpoint, region, accessKeyID, secretAccessKe }) } -func notifyResumableSandboxesBeforeRestart(ctx context.Context, mgr sandbox.Manager, store *db.Store, sandboxDBs *sandbox.SandboxDBManager, noticeWindow time.Duration, eta time.Time) { - if mgr == nil { - return - } - wait := noticeWindow - if !eta.IsZero() { - until := time.Until(eta) - switch { - case until <= 0: - wait = 0 - case until <= 5*time.Second: - wait = until - case until-5*time.Second < wait: - wait = until - 5*time.Second - } - } - if wait < 0 { - wait = 0 - } - - listCtx, cancel := context.WithTimeout(ctx, 5*time.Second) - sandboxes, err := 
mgr.List(listCtx) - cancel() - if err != nil { - log.Printf("opensandbox-worker: resumable restart notice: failed to list sandboxes: %v", err) - sleepContext(ctx, wait) - return - } - - started := time.Now() - noticeSeconds := int(wait.Seconds()) - if noticeSeconds < 0 { - noticeSeconds = 0 - } - hookTimeout := noticeSeconds - if hookTimeout <= 0 { - hookTimeout = 1 - } - if hookTimeout > 20 { - hookTimeout = 20 - } - - const restartNoticeScript = `notice="${OPENSANDBOX_RESUME_NOTICE_SECONDS:-25}" -export OPENSANDBOX_RESTART_NOTICE_SECONDS="$notice" -for hook in /etc/opencomputer/on-restart-notice /home/sandbox/.opencomputer/on-restart-notice; do - if [ -x "$hook" ]; then - "$hook" "$notice" - fi -done -sync` - - var wg sync.WaitGroup - notified := 0 - for _, sb := range sandboxes { - if sb.Status != "" && sb.Status != "running" { - continue - } - if !isResumableSandboxSession(ctx, store, sb.ID) { - continue - } - - notified++ - if sandboxDBs != nil { - if sdb, dbErr := sandboxDBs.Get(sb.ID); dbErr == nil { - _ = sdb.LogEvent("restart_notice", map[string]string{ - "sandbox_id": sb.ID, - "notice_seconds": fmt.Sprintf("%d", noticeSeconds), - "restart_reason": "worker_preemption", - "preserves_disk": "true", - "preserves_memory": "false", - }) - } - } - - sandboxID := sb.ID - wg.Add(1) - go func() { - defer wg.Done() - execCtx, execCancel := context.WithTimeout(ctx, time.Duration(hookTimeout)*time.Second) - defer execCancel() - _, err := mgr.Exec(execCtx, sandboxID, types.ProcessConfig{ - Command: "/bin/sh", - Args: []string{"-lc", restartNoticeScript}, - Env: map[string]string{ - "OPENSANDBOX_RESUMABLE": "true", - "OPENSANDBOX_RESUME_NOTICE_SECONDS": fmt.Sprintf("%d", noticeSeconds), - }, - Timeout: hookTimeout, - }) - if err != nil { - log.Printf("opensandbox-worker: resumable restart notice: hook failed for %s: %v", sandboxID, err) - } - }() - } - - if notified == 0 { - log.Printf("opensandbox-worker: resumable restart notice: no resumable sandboxes found") - 
sleepContext(ctx, wait) - return - } - log.Printf("opensandbox-worker: resumable restart notice: notifying %d sandboxes with %ds notice", notified, noticeSeconds) - wg.Wait() - - remaining := wait - time.Since(started) - if remaining > 0 { - sleepContext(ctx, remaining) - } - log.Printf("opensandbox-worker: resumable restart notice: completed for %d sandboxes", notified) -} - -func isResumableSandboxSession(ctx context.Context, store *db.Store, sandboxID string) bool { - if store == nil || sandboxID == "" { - return false - } - sessionCtx, cancel := context.WithTimeout(ctx, 2*time.Second) - defer cancel() - session, err := store.GetSandboxSession(sessionCtx, sandboxID) - if err != nil || session == nil { - if err != nil { - log.Printf("opensandbox-worker: resumable restart notice: failed to load sandbox session %s: %v", sandboxID, err) - } - return false - } - var cfg types.SandboxConfig - if len(session.Config) > 0 { - if err := json.Unmarshal(session.Config, &cfg); err != nil { - log.Printf("opensandbox-worker: resumable restart notice: failed to parse config for %s: %v", sandboxID, err) - return false - } - } - return cfg.IsResumable() -} - -func sleepContext(ctx context.Context, d time.Duration) { - if d <= 0 { - return - } - timer := time.NewTimer(d) - defer timer.Stop() - select { - case <-ctx.Done(): - case <-timer.C: - } -} - // createExecSessionQEMU creates an exec session using a QEMU agent client. 
func createExecSessionQEMU(agent *qm.AgentClient, sandboxID string, req types.ExecSessionCreateRequest) (*sandbox.ExecSessionHandle, error) { agentPB := &agentpb.ExecSessionCreateRequest{ diff --git a/internal/api/admin_workers.go b/internal/api/admin_workers.go index 841b61f8..a9297dc7 100644 --- a/internal/api/admin_workers.go +++ b/internal/api/admin_workers.go @@ -1,9 +1,16 @@ package api import ( + "bytes" + "encoding/json" + "fmt" "net/http" + "strings" + "time" "github.com/labstack/echo/v4" + "github.com/opensandbox/opensandbox/internal/controlplane" + "github.com/opensandbox/opensandbox/pkg/types" ) // adminSetWorkerDraining toggles the in-memory `Draining` flag on a worker so @@ -76,3 +83,117 @@ func (s *Server) adminEvacuateWorker(c echo.Context) error { "evacuating": true, }) } + +type adminRecreateSandboxRequest struct { + TargetWorkerID string `json:"targetWorkerId,omitempty"` +} + +func (s *Server) adminRecreateSandbox(c echo.Context) error { + if s.store == nil || s.workerRegistry == nil { + return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "store and worker registry are required"}) + } + sandboxID := c.Param("id") + if sandboxID == "" { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandbox id required"}) + } + + var req adminRecreateSandboxRequest + _ = c.Bind(&req) + + session, err := s.store.GetSandboxSession(c.Request().Context(), sandboxID) + if err != nil { + return c.JSON(http.StatusNotFound, map[string]string{"error": "sandbox not found"}) + } + var cfg types.SandboxConfig + if len(session.Config) > 0 { + if err := json.Unmarshal(session.Config, &cfg); err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid sandbox config: " + err.Error()}) + } + } + if !cfg.IsResumable() { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandbox is not resumable"}) + } + cfg.SandboxID = sandboxID + + target := s.findRecreateTarget(session.WorkerID, session.Region, 
req.TargetWorkerID) + if target == nil { + return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "no eligible target worker"}) + } + if target.HTTPAddr == "" { + return c.JSON(http.StatusServiceUnavailable, map[string]string{"error": "target worker has no HTTP address"}) + } + + body, _ := json.Marshal(map[string]any{ + "sandboxId": sandboxID, + "config": cfg, + }) + url := strings.TrimRight(target.HTTPAddr, "/") + "/admin/resumable/recreate" + httpReq, err := http.NewRequestWithContext(c.Request().Context(), http.MethodPost, url, bytes.NewReader(body)) + if err != nil { + return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()}) + } + httpReq.Header.Set("Content-Type", "application/json") + client := &http.Client{Timeout: 120 * time.Second} + resp, err := client.Do(httpReq) + if err != nil { + return c.JSON(http.StatusBadGateway, map[string]string{"error": "target worker recreate failed: " + err.Error()}) + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + var workerErr map[string]string + _ = json.NewDecoder(resp.Body).Decode(&workerErr) + msg := workerErr["error"] + if msg == "" { + msg = fmt.Sprintf("target worker returned HTTP %d", resp.StatusCode) + } + return c.JSON(http.StatusBadGateway, map[string]string{"error": msg}) + } + var sb types.Sandbox + if err := json.NewDecoder(resp.Body).Decode(&sb); err != nil { + return c.JSON(http.StatusBadGateway, map[string]string{"error": "decode target worker response: " + err.Error()}) + } + + if err := s.store.UpdateSandboxSessionForRecreate(c.Request().Context(), sandboxID, target.ID); err != nil { + return c.JSON(http.StatusInternalServerError, map[string]string{"error": "update sandbox session: " + err.Error()}) + } + if target.GoldenVersion != "" { + _ = s.store.SetSandboxGoldenVersion(c.Request().Context(), sandboxID, target.GoldenVersion) + } + if s.sandboxAPIProxy != nil { + s.sandboxAPIProxy.InvalidateRouteCache(sandboxID) + } + 
s.emitEvent("resumed", sandboxID, target.ID, "resumable recreate") + + return c.JSON(http.StatusOK, map[string]any{ + "sandboxID": sandboxID, + "previousWorker": session.WorkerID, + "workerID": target.ID, + "status": sb.Status, + }) +} + +func (s *Server) findRecreateTarget(sourceWorkerID, region, targetWorkerID string) *controlplane.WorkerEntry { + if targetWorkerID != "" { + w := s.workerRegistry.GetWorker(targetWorkerID) + if w == nil || w.Draining { + return nil + } + return w + } + var best *controlplane.WorkerEntry + bestScore := 1e18 + for _, w := range s.workerRegistry.GetAllWorkers() { + if w == nil || w.ID == sourceWorkerID || w.Draining { + continue + } + if region != "" && w.Region != region { + continue + } + score := float64(w.Current)*1000 + w.CPUPct + w.MemPct + w.DiskPct + if best == nil || score < bestScore { + best = w + bestScore = score + } + } + return best +} diff --git a/internal/api/router.go b/internal/api/router.go index 8773facc..054aafa7 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -275,6 +275,7 @@ func NewServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, apiKey string, o admin.POST("/events/clear", s.adminClearEvents) admin.POST("/workers/:id/drain", s.adminSetWorkerDraining) admin.POST("/workers/:id/evacuate", s.adminEvacuateWorker) + admin.POST("/sandboxes/:id/recreate", s.adminRecreateSandbox) admin.GET("/demo/migration", s.demoPingPongPage) admin.GET("/demo/chaos", s.demoChaosPage) admin.GET("/demo/drain", s.adminDrainVisualizerPage) diff --git a/internal/db/store.go b/internal/db/store.go index 09e904fa..c03bcd1b 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -1418,6 +1418,18 @@ func (s *Store) UpdateSandboxSessionForWake(ctx context.Context, sandboxID, newW return err } +// UpdateSandboxSessionForRecreate marks a resumable sandbox running on a new +// worker after disk-only recreation. 
Unlike wake, this path does not require a +// hibernation row because RAM is intentionally not preserved. +func (s *Store) UpdateSandboxSessionForRecreate(ctx context.Context, sandboxID, newWorkerID string) error { + _, err := s.pool.Exec(ctx, + `UPDATE sandbox_sessions + SET status = 'running', worker_id = $1, stopped_at = NULL, error_msg = NULL + WHERE sandbox_id = $2`, + newWorkerID, sandboxID) + return err +} + // ReconcileWorkerSessions marks stale "running" sessions for a worker on startup. // Sessions with an active checkpoint are set to "hibernated" (recoverable via wake-on-request). // Sessions without a checkpoint are set to "stopped" (VM is gone, no recovery possible). diff --git a/internal/qemu/reset.go b/internal/qemu/reset.go index d4a0f8ec..b7fc72e2 100644 --- a/internal/qemu/reset.go +++ b/internal/qemu/reset.go @@ -10,6 +10,7 @@ import ( "path/filepath" "time" + "github.com/opensandbox/opensandbox/pkg/types" pb "github.com/opensandbox/opensandbox/proto/agent" ) @@ -334,3 +335,217 @@ func (m *Manager) PowerCycleSandbox(ctx context.Context, sandboxID string) (host sandboxID, time.Since(t0).Milliseconds(), freshPort, netCfg.TAPName) return freshPort, nil } + +// StartExistingSandbox cold-boots an existing sandbox directory on this worker. +// It is the disk-only resumable recovery primitive: no savevm/loadvm, no RAM +// preservation, just a fresh QEMU process using the existing rootfs/workspace +// drives for the same sandbox ID. 
+func (m *Manager) StartExistingSandbox(ctx context.Context, sandboxID string, cfg types.SandboxConfig) (*types.Sandbox, error) { + t0 := time.Now() + + m.mu.Lock() + if _, exists := m.vms[sandboxID]; exists { + m.mu.Unlock() + return nil, fmt.Errorf("sandbox %s is already running on this worker", sandboxID) + } + m.mu.Unlock() + + sandboxDir := filepath.Join(m.cfg.DataDir, "sandboxes", sandboxID) + rootfsPath := detectDrivePath(sandboxDir, "rootfs") + workspacePath := detectDrivePath(sandboxDir, "workspace") + if !fileExists(rootfsPath) || !fileExists(workspacePath) { + return nil, fmt.Errorf("sandbox %s: existing drives missing on this worker (rootfs=%v, workspace=%v, dir=%s)", + sandboxID, fileExists(rootfsPath), fileExists(workspacePath), sandboxDir) + } + + var meta SandboxMeta + if data, err := os.ReadFile(filepath.Join(sandboxDir, "sandbox-meta.json")); err == nil { + _ = json.Unmarshal(data, &meta) + } + template := cfg.Template + if template == "" { + template = meta.Template + } + if template == "" { + template = "default" + } + cpus := cfg.CpuCount + if cpus <= 0 { + cpus = meta.CpuCount + } + if cpus <= 0 { + cpus = m.cfg.DefaultCPUs + } + memMB := cfg.MemoryMB + if memMB <= 0 { + memMB = meta.MemoryMB + } + if memMB <= 0 { + memMB = m.cfg.DefaultMemoryMB + } + guestPort := cfg.Port + if guestPort == 0 { + guestPort = meta.GuestPort + } + if guestPort == 0 { + guestPort = m.cfg.DefaultPort + } + + netCfg, err := m.subnets.Allocate() + if err != nil { + return nil, fmt.Errorf("allocate subnet: %w", err) + } + if err := CreateTAP(netCfg); err != nil { + m.subnets.Release(netCfg.TAPName) + return nil, fmt.Errorf("create TAP: %w", err) + } + hostPort, err := FindFreePort() + if err != nil { + DeleteTAP(netCfg.TAPName) + m.subnets.Release(netCfg.TAPName) + return nil, fmt.Errorf("find free port: %w", err) + } + netCfg.HostPort = hostPort + netCfg.GuestPort = guestPort + if err := AddDNAT(netCfg); err != nil { + DeleteTAP(netCfg.TAPName) + 
m.subnets.Release(netCfg.TAPName) + return nil, fmt.Errorf("add DNAT: %w", err) + } + if err := AddMetadataDNAT(netCfg.TAPName, netCfg.HostIP); err != nil { + log.Printf("qemu: StartExistingSandbox %s: metadata DNAT failed: %v", sandboxID, err) + } + + guestMAC := generateMAC(sandboxID) + guestCID := m.allocateCID() + bootArgs := fmt.Sprintf( + "console=ttyS0 reboot=k panic=1 root=/dev/vda rw ip=%s::%s:%s::eth0:off init=/sbin/init osb.gateway=%s", + netCfg.GuestIP, netCfg.HostIP, netCfg.Mask, netCfg.HostIP, + ) + qmpSockPath := filepath.Join(sandboxDir, "qmp.sock") + agentSockPath := filepath.Join(sandboxDir, "agent.sock") + os.Remove(qmpSockPath) + os.Remove(agentSockPath) + + logFile, err := os.Create(filepath.Join(sandboxDir, "qemu.log")) + if err != nil { + m.cleanupVM(netCfg, "") + return nil, fmt.Errorf("create log: %w", err) + } + args := m.buildQEMUArgs(cpus, memMB, rootfsPath, workspacePath, + netCfg.TAPName, guestMAC, agentSockPath, qmpSockPath, bootArgs) + cmd := exec.Command(m.cfg.QEMUBin, args...) 
+ cmd.Stdout = logFile + cmd.Stderr = logFile + if err := cmd.Start(); err != nil { + logFile.Close() + m.cleanupVM(netCfg, "") + return nil, fmt.Errorf("start QEMU: %w", err) + } + logFile.Close() + + qmpClient, err := waitForQMP(qmpSockPath, 30*time.Second) + if err != nil { + cmd.Process.Kill() + cmd.Wait() + m.cleanupVM(netCfg, "") + return nil, fmt.Errorf("QMP connect: %w", err) + } + agentClient, err := m.waitForAgentSocket(ctx, agentSockPath, 60*time.Second) + if err != nil { + qmpClient.Close() + cmd.Process.Kill() + cmd.Wait() + m.cleanupVM(netCfg, "") + return nil, fmt.Errorf("agent connect: %w", err) + } + + if err := syncGuestClock(ctx, agentClient); err != nil { + log.Printf("qemu: StartExistingSandbox %s: clock sync failed: %v", sandboxID, err) + } + mountCtx, mountCancel := context.WithTimeout(ctx, 15*time.Second) + _, mountErr := agentClient.Exec(mountCtx, &pb.ExecRequest{ + Command: "/bin/sh", + Args: []string{"-c", "mount /dev/vdb /home/sandbox 2>/dev/null || true; resize2fs /dev/vdb 2>/dev/null || true; chown 1000:1000 /home/sandbox"}, + RunAsRoot: true, + }) + mountCancel() + if mountErr != nil { + log.Printf("qemu: StartExistingSandbox %s: mount /home/sandbox failed: %v", sandboxID, mountErr) + } + m.setupAptCacheBindMount(ctx, sandboxID, agentClient) + m.reinstallProxyCA(ctx, sandboxID, agentClient) + + envsToInject := m.sealSandboxEnvs(ctx, sandboxID, netCfg, agentClient, cfg) + if len(envsToInject) > 0 { + envCtx, envCancel := context.WithTimeout(ctx, 5*time.Second) + if err := agentClient.SetEnvs(envCtx, envsToInject); err != nil { + log.Printf("qemu: StartExistingSandbox %s: SetEnvs failed: %v", sandboxID, err) + } + envCancel() + } + + now := time.Now() + timeout := time.Duration(cfg.Timeout) * time.Second + if timeout <= 0 { + timeout = 300 * time.Second + } + vm := &VMInstance{ + ID: sandboxID, + Template: template, + Status: types.SandboxStatusRunning, + StartedAt: now, + EndAt: now.Add(timeout), + CpuCount: cpus, + MemoryMB: memMB, + 
baseMemoryMB: memMB, + HostPort: hostPort, + GuestPort: guestPort, + pid: cmd.Process.Pid, + cmd: cmd, + network: netCfg, + sandboxDir: sandboxDir, + agent: agentClient, + qmpSockPath: qmpSockPath, + agentSockPath: agentSockPath, + qmp: qmpClient, + guestMAC: guestMAC, + guestCID: guestCID, + bootArgs: bootArgs, + goldenVersion: m.goldenVersion, + } + + m.mu.Lock() + m.vms[sandboxID] = vm + m.mu.Unlock() + + if m.onSandboxReady != nil { + m.onSandboxReady(sandboxID, netCfg.GuestIP, template, vm.StartedAt) + } + + sbMeta := SandboxMeta{ + SandboxID: sandboxID, + Template: template, + CpuCount: cpus, + MemoryMB: memMB, + GuestPort: guestPort, + } + if metaJSON, err := json.Marshal(sbMeta); err == nil { + if writeErr := os.WriteFile(filepath.Join(sandboxDir, "sandbox-meta.json"), metaJSON, 0644); writeErr != nil { + log.Printf("qemu: WARNING: failed to write sandbox-meta.json for %s: %v", sandboxDir, writeErr) + } + } + + log.Printf("qemu: StartExistingSandbox %s: complete (%dms, port=%d→%d, tap=%s)", + sandboxID, time.Since(t0).Milliseconds(), hostPort, guestPort, netCfg.TAPName) + return &types.Sandbox{ + ID: sandboxID, + Template: template, + Status: types.SandboxStatusRunning, + StartedAt: now, + EndAt: now.Add(timeout), + CpuCount: cpus, + MemoryMB: memMB, + HostPort: hostPort, + }, nil +} diff --git a/internal/worker/handlers.go b/internal/worker/handlers.go index 9b5deee8..90843d4f 100644 --- a/internal/worker/handlers.go +++ b/internal/worker/handlers.go @@ -21,6 +21,86 @@ import ( // jsonMarshal is a helper to marshal JSON for agent session stdin commands. 
var jsonMarshal = json.Marshal +type restartNoticeRequest struct { + NoticeSeconds int `json:"noticeSeconds"` + ETA string `json:"eta,omitempty"` +} + +type recreateSandboxRequest struct { + SandboxID string `json:"sandboxId"` + Config types.SandboxConfig `json:"config"` +} + +type existingSandboxStarter interface { + StartExistingSandbox(ctx context.Context, sandboxID string, cfg types.SandboxConfig) (*types.Sandbox, error) +} + +func (s *HTTPServer) adminRestartNotice(c echo.Context) error { + var req restartNoticeRequest + if c.Request().Body != nil { + _ = c.Bind(&req) + } + + notice := 25 * time.Second + if req.NoticeSeconds > 0 { + if req.NoticeSeconds > 120 { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "noticeSeconds must be <= 120"}) + } + notice = time.Duration(req.NoticeSeconds) * time.Second + } + + var eta time.Time + if req.ETA != "" { + parsed, err := time.Parse(time.RFC3339, req.ETA) + if err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "eta must be RFC3339"}) + } + eta = parsed + } + + notified := NotifyResumableSandboxesBeforeRestart(c.Request().Context(), s.manager, s.store, s.sandboxDBs, notice, eta) + return c.JSON(http.StatusAccepted, map[string]any{ + "notified": notified, + "noticeSeconds": int(notice.Seconds()), + }) +} + +func (s *HTTPServer) adminRecreateSandbox(c echo.Context) error { + var req recreateSandboxRequest + if err := c.Bind(&req); err != nil { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "invalid request body: " + err.Error()}) + } + if req.SandboxID == "" { + return c.JSON(http.StatusBadRequest, map[string]string{"error": "sandboxId required"}) + } + starter, ok := s.manager.(existingSandboxStarter) + if !ok { + return c.JSON(http.StatusNotImplemented, map[string]string{"error": "backend does not support recreating existing sandbox disks"}) + } + sb, err := starter.StartExistingSandbox(c.Request().Context(), req.SandboxID, req.Config) + if err != nil { + 
return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()}) + } + if s.router != nil { + timeout := req.Config.Timeout + if timeout < 0 { + timeout = 0 + } + s.router.Register(req.SandboxID, time.Duration(timeout)*time.Second) + } + if s.sandboxDBs != nil { + if sdb, dbErr := s.sandboxDBs.Get(req.SandboxID); dbErr == nil { + _ = sdb.LogEvent("resumed", map[string]string{ + "sandbox_id": req.SandboxID, + "restart_reason": "resumable_recreate", + "preserves_disk": "true", + "preserves_memory": "false", + }) + } + } + return c.JSON(http.StatusOK, sb) +} + func (s *HTTPServer) setTimeout(c echo.Context) error { if s.router == nil { return c.JSON(http.StatusServiceUnavailable, map[string]string{ diff --git a/internal/worker/http_server.go b/internal/worker/http_server.go index 14eae37f..4063dae2 100644 --- a/internal/worker/http_server.go +++ b/internal/worker/http_server.go @@ -6,6 +6,7 @@ import ( "github.com/labstack/echo/v4" "github.com/labstack/echo/v4/middleware" "github.com/opensandbox/opensandbox/internal/auth" + "github.com/opensandbox/opensandbox/internal/db" "github.com/opensandbox/opensandbox/internal/observability" "github.com/opensandbox/opensandbox/internal/obslog" "github.com/opensandbox/opensandbox/internal/proxy" @@ -23,10 +24,11 @@ type HTTPServer struct { sandboxDBs *sandbox.SandboxDBManager router *sandbox.SandboxRouter sandboxDomain string + store *db.Store } // NewHTTPServer creates a new worker HTTP server for direct SDK access. 
-func NewHTTPServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, execMgr *sandbox.ExecSessionManager, jwtIssuer *auth.JWTIssuer, sandboxDBs *sandbox.SandboxDBManager, sbProxy *proxy.SandboxProxy, sbRouter *sandbox.SandboxRouter, sandboxDomain string) *HTTPServer { +func NewHTTPServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, execMgr *sandbox.ExecSessionManager, jwtIssuer *auth.JWTIssuer, sandboxDBs *sandbox.SandboxDBManager, sbProxy *proxy.SandboxProxy, sbRouter *sandbox.SandboxRouter, sandboxDomain string, store *db.Store) *HTTPServer { e := echo.New() e.HideBanner = true e.HidePort = true @@ -40,6 +42,7 @@ func NewHTTPServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, execMgr *san sandboxDBs: sandboxDBs, router: sbRouter, sandboxDomain: sandboxDomain, + store: store, } // Global middleware. Sentry goes first so it can observe panics and @@ -63,6 +66,8 @@ func NewHTTPServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, execMgr *san e.GET("/health", func(c echo.Context) error { return c.JSON(http.StatusOK, map[string]string{"status": "ok", "role": "worker"}) }) + e.POST("/admin/resumable/restart-notice", s.adminRestartNotice) + e.POST("/admin/resumable/recreate", s.adminRecreateSandbox) // All sandbox routes require JWT auth api := e.Group("") diff --git a/internal/worker/resumable_notice.go b/internal/worker/resumable_notice.go new file mode 100644 index 00000000..745edff9 --- /dev/null +++ b/internal/worker/resumable_notice.go @@ -0,0 +1,163 @@ +package worker + +import ( + "context" + "encoding/json" + "fmt" + "log" + "sync" + "time" + + "github.com/opensandbox/opensandbox/internal/db" + "github.com/opensandbox/opensandbox/internal/sandbox" + "github.com/opensandbox/opensandbox/pkg/types" +) + +// NotifyResumableSandboxesBeforeRestart runs the same in-guest notice hook used +// by cloud preemption handling. It gives resumable sandboxes a bounded window +// to flush state to disk before the host restarts. 
+func NotifyResumableSandboxesBeforeRestart(ctx context.Context, mgr sandbox.Manager, store *db.Store, sandboxDBs *sandbox.SandboxDBManager, noticeWindow time.Duration, eta time.Time) int { + if mgr == nil { + return 0 + } + wait := noticeWindow + if !eta.IsZero() { + until := time.Until(eta) + switch { + case until <= 0: + wait = 0 + case until <= 5*time.Second: + wait = until + case until-5*time.Second < wait: + wait = until - 5*time.Second + } + } + if wait < 0 { + wait = 0 + } + + listCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + sandboxes, err := mgr.List(listCtx) + cancel() + if err != nil { + log.Printf("opensandbox-worker: resumable restart notice: failed to list sandboxes: %v", err) + sleepContext(ctx, wait) + return 0 + } + + started := time.Now() + noticeSeconds := int(wait.Seconds()) + if noticeSeconds < 0 { + noticeSeconds = 0 + } + hookTimeout := noticeSeconds + if hookTimeout <= 0 { + hookTimeout = 1 + } + if hookTimeout > 20 { + hookTimeout = 20 + } + + const restartNoticeScript = `notice="${OPENSANDBOX_RESUME_NOTICE_SECONDS:-25}" +export OPENSANDBOX_RESTART_NOTICE_SECONDS="$notice" +for hook in /etc/opencomputer/on-restart-notice /home/sandbox/.opencomputer/on-restart-notice; do + if [ -x "$hook" ]; then + "$hook" "$notice" + fi +done +sync` + + var wg sync.WaitGroup + notified := 0 + for _, sb := range sandboxes { + if sb.Status != "" && sb.Status != "running" { + continue + } + if !isResumableSandboxSession(ctx, store, sb.ID) { + continue + } + + notified++ + if sandboxDBs != nil { + if sdb, dbErr := sandboxDBs.Get(sb.ID); dbErr == nil { + _ = sdb.LogEvent("restart_notice", map[string]string{ + "sandbox_id": sb.ID, + "notice_seconds": fmt.Sprintf("%d", noticeSeconds), + "restart_reason": "worker_preemption", + "preserves_disk": "true", + "preserves_memory": "false", + }) + } + } + + sandboxID := sb.ID + wg.Add(1) + go func() { + defer wg.Done() + execCtx, execCancel := context.WithTimeout(ctx, time.Duration(hookTimeout)*time.Second) + 
defer execCancel() + _, err := mgr.Exec(execCtx, sandboxID, types.ProcessConfig{ + Command: "/bin/sh", + Args: []string{"-lc", restartNoticeScript}, + Env: map[string]string{ + "OPENSANDBOX_RESUMABLE": "true", + "OPENSANDBOX_RESUME_NOTICE_SECONDS": fmt.Sprintf("%d", noticeSeconds), + }, + Timeout: hookTimeout, + }) + if err != nil { + log.Printf("opensandbox-worker: resumable restart notice: hook failed for %s: %v", sandboxID, err) + } + }() + } + + if notified == 0 { + log.Printf("opensandbox-worker: resumable restart notice: no resumable sandboxes found") + sleepContext(ctx, wait) + return 0 + } + log.Printf("opensandbox-worker: resumable restart notice: notifying %d sandboxes with %ds notice", notified, noticeSeconds) + wg.Wait() + + remaining := wait - time.Since(started) + if remaining > 0 { + sleepContext(ctx, remaining) + } + log.Printf("opensandbox-worker: resumable restart notice: completed for %d sandboxes", notified) + return notified +} + +func isResumableSandboxSession(ctx context.Context, store *db.Store, sandboxID string) bool { + if store == nil || sandboxID == "" { + return false + } + sessionCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + session, err := store.GetSandboxSession(sessionCtx, sandboxID) + if err != nil || session == nil { + if err != nil { + log.Printf("opensandbox-worker: resumable restart notice: failed to load sandbox session %s: %v", sandboxID, err) + } + return false + } + var cfg types.SandboxConfig + if len(session.Config) > 0 { + if err := json.Unmarshal(session.Config, &cfg); err != nil { + log.Printf("opensandbox-worker: resumable restart notice: failed to parse config for %s: %v", sandboxID, err) + return false + } + } + return cfg.IsResumable() +} + +func sleepContext(ctx context.Context, d time.Duration) { + if d <= 0 { + return + } + timer := time.NewTimer(d) + defer timer.Stop() + select { + case <-ctx.Done(): + case <-timer.C: + } +} From 56d0277aa5a191d8af0fc5436e748a96b87c576f Mon Sep 17 
00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 08:38:38 -0700 Subject: [PATCH 19/32] Handle resumable worker shutdown safely --- cmd/worker/main.go | 92 +++++++++++++++++++++---- internal/db/migration_pgfixture_test.go | 55 +++++++++++++++ internal/db/store.go | 38 ++++++++-- internal/qemu/golden_version.go | 21 ++++++ internal/qemu/manager.go | 8 +++ internal/qemu/reset.go | 31 +++++++-- internal/worker/handlers.go | 6 ++ 7 files changed, 225 insertions(+), 26 deletions(-) diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 21ba0237..a25bd3cb 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "encoding/json" "fmt" "io" "log" @@ -308,9 +309,15 @@ func main() { if len(vms) == 0 { return } - log.Printf("opensandbox-worker: hibernating %d sandboxes...", len(vms)) + skip := shutdownHibernateSkipSet(context.Background(), store, cfg.WorkerID, vms) + hibernateCount := len(vms) - len(skip) + if hibernateCount == 0 { + log.Printf("opensandbox-worker: shutdown skipping hibernation for %d sandbox(es)", len(skip)) + return + } + log.Printf("opensandbox-worker: hibernating %d sandboxes (skipping %d)...", hibernateCount, len(skip)) shutCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) - results := qmMgr.HibernateAll(shutCtx, checkpointStore) + results := qmMgr.HibernateAllExcept(shutCtx, checkpointStore, skip) cancel() // Log which VMs were NOT hibernated @@ -324,7 +331,7 @@ func main() { log.Printf("opensandbox-worker: %d VMs failed to hibernate: %v", len(failed), failed) } - processHibernateResults(results, store, checkpointStore, sandboxDBMgr, func(r interface{}) (string, string, error) { + processHibernateResults(results, store, checkpointStore, sandboxDBMgr, cfg.WorkerID, func(r interface{}) (string, string, error) { hr := r.(qm.HibernateAllResult) return hr.SandboxID, hr.HibernationKey, hr.Err }) @@ -1134,6 +1141,48 @@ func deleteOldHibernation(store *storage.CheckpointStore, key 
string) { } } +func shutdownHibernateSkipSet(ctx context.Context, store *db.Store, workerID string, sandboxes []types.Sandbox) map[string]struct{} { + skip := make(map[string]struct{}) + if store == nil { + return skip + } + for _, sb := range sandboxes { + if sb.ID == "" { + continue + } + session, err := store.GetSandboxSession(ctx, sb.ID) + if err != nil { + log.Printf("opensandbox-worker: shutdown hibernate: could not load session %s: %v", sb.ID, err) + continue + } + if session.WorkerID != workerID { + log.Printf("opensandbox-worker: shutdown hibernate: skipping %s; DB owner is %s, not %s", sb.ID, session.WorkerID, workerID) + skip[sb.ID] = struct{}{} + continue + } + if sandboxSessionIsResumable(session) { + log.Printf("opensandbox-worker: shutdown hibernate: skipping resumable sandbox %s", sb.ID) + skip[sb.ID] = struct{}{} + } + } + return skip +} + +func sandboxSessionIsResumable(session *db.SandboxSession) bool { + if session == nil || len(session.Config) == 0 { + return false + } + var cfg types.SandboxConfig + if err := json.Unmarshal(session.Config, &cfg); err != nil { + return false + } + return cfg.IsResumable() +} + +func sandboxSessionOwnedByWorker(session *db.SandboxSession, workerID string) bool { + return session != nil && session.WorkerID == workerID +} + // processHibernateResults handles results from HibernateAll for both backends. // // In addition to updating cell-local PG, we LogEvent("hibernated") into the @@ -1143,10 +1192,32 @@ func deleteOldHibernation(store *storage.CheckpointStore, key string) { // sync. Without this, the bulk-shutdown path silently skipped the lifecycle // event the gRPC HibernateSandbox handler emits per call, and D1 stayed // "running" until something else nudged it. 
-func processHibernateResults(results interface{}, store *db.Store, checkpointStore *storage.CheckpointStore, sandboxDBs *sandbox.SandboxDBManager, extract func(interface{}) (string, string, error)) { +func processHibernateResults(results interface{}, store *db.Store, checkpointStore *storage.CheckpointStore, sandboxDBs *sandbox.SandboxDBManager, workerID string, extract func(interface{}) (string, string, error)) { switch rs := results.(type) { case []qm.HibernateAllResult: for _, r := range rs { + var session *db.SandboxSession + if store != nil { + var err error + session, err = store.GetSandboxSession(context.Background(), r.SandboxID) + if err != nil { + log.Printf("opensandbox-worker: shutdown hibernate: could not load session %s: %v", r.SandboxID, err) + } + } + if store != nil && session != nil && !sandboxSessionOwnedByWorker(session, workerID) { + log.Printf("opensandbox-worker: shutdown hibernate: ignoring result for %s; DB owner is %s, not %s", r.SandboxID, session.WorkerID, workerID) + if sandboxDBs != nil { + _ = sandboxDBs.Remove(r.SandboxID) + } + continue + } + if sandboxSessionIsResumable(session) { + log.Printf("opensandbox-worker: shutdown hibernate: ignoring result for resumable sandbox %s", r.SandboxID) + if sandboxDBs != nil { + _ = sandboxDBs.Remove(r.SandboxID) + } + continue + } if r.Err != nil { log.Printf("opensandbox-worker: hibernate failed for %s: %v", r.SandboxID, r.Err) if store != nil { @@ -1162,14 +1233,11 @@ func processHibernateResults(results interface{}, store *db.Store, checkpointSto continue } log.Printf("opensandbox-worker: hibernated %s (key=%s)", r.SandboxID, r.HibernationKey) - if store != nil { - session, err := store.GetSandboxSession(context.Background(), r.SandboxID) - if err == nil { - _, superseded, _ := store.CreateHibernation(context.Background(), r.SandboxID, session.OrgID, - r.HibernationKey, 0, session.Region, session.Template, session.Config) - deleteOldHibernation(checkpointStore, superseded) - _ = 
store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "hibernated", nil) - } + if store != nil && session != nil { + _, superseded, _ := store.CreateHibernation(context.Background(), r.SandboxID, session.OrgID, + r.HibernationKey, 0, session.Region, session.Template, session.Config) + deleteOldHibernation(checkpointStore, superseded) + _ = store.UpdateSandboxSessionStatus(context.Background(), r.SandboxID, "hibernated", nil) } if sandboxDBs != nil { if sdb, err := sandboxDBs.Get(r.SandboxID); err == nil { diff --git a/internal/db/migration_pgfixture_test.go b/internal/db/migration_pgfixture_test.go index 3810d9ba..fed2cbf9 100644 --- a/internal/db/migration_pgfixture_test.go +++ b/internal/db/migration_pgfixture_test.go @@ -81,3 +81,58 @@ func TestCompleteMigrationDoesNotRecoverUnrelatedError_pgfixture(t *testing.T) { t.Fatalf("expected unrelated error message preserved, got %#v", sess.ErrorMsg) } } + +func TestReconcileWorkerSessionsSkipsResumable_pgfixture(t *testing.T) { + ctx := context.Background() + store := openPgStore(t) + orgID := seedOrgWithCap(t, store, 16) + workerID := "worker-reconcile-resumable" + + resumableID := freshSandboxID("reconcile-resumable") + hibernatableID := freshSandboxID("reconcile-hibernatable") + stoppableID := freshSandboxID("reconcile-stoppable") + + if _, err := store.CreateSandboxSession(ctx, resumableID, orgID, nil, "default", "us-east-2", workerID, json.RawMessage(`{"resumable":true}`), json.RawMessage(`{}`), nil); err != nil { + t.Fatalf("create resumable session: %v", err) + } + if _, err := store.CreateSandboxSession(ctx, hibernatableID, orgID, nil, "default", "us-east-2", workerID, json.RawMessage(`{}`), json.RawMessage(`{}`), nil); err != nil { + t.Fatalf("create hibernatable session: %v", err) + } + if _, _, err := store.CreateHibernation(ctx, hibernatableID, orgID, "s3://checkpoint", 123, "us-east-2", "default", json.RawMessage(`{}`)); err != nil { + t.Fatalf("create hibernation: %v", err) + } + if _, err 
:= store.CreateSandboxSession(ctx, stoppableID, orgID, nil, "default", "us-east-2", workerID, json.RawMessage(`{}`), json.RawMessage(`{}`), nil); err != nil { + t.Fatalf("create stoppable session: %v", err) + } + + hibernated, stopped, err := store.ReconcileWorkerSessions(ctx, workerID) + if err != nil { + t.Fatalf("reconcile worker sessions: %v", err) + } + if !containsOrphan(hibernated, hibernatableID) { + t.Fatalf("expected hibernatable sandbox to be hibernated, got %#v", hibernated) + } + if !containsOrphan(stopped, stoppableID) { + t.Fatalf("expected stoppable sandbox to be stopped, got %#v", stopped) + } + if containsOrphan(hibernated, resumableID) || containsOrphan(stopped, resumableID) { + t.Fatalf("resumable sandbox should not be reconciled, hibernated=%#v stopped=%#v", hibernated, stopped) + } + + resumable, err := store.GetSandboxSession(ctx, resumableID) + if err != nil { + t.Fatalf("get resumable session: %v", err) + } + if resumable.Status != "running" { + t.Fatalf("expected resumable session to stay running, got %q", resumable.Status) + } +} + +func containsOrphan(rows []OrphanedSandbox, sandboxID string) bool { + for _, row := range rows { + if row.SandboxID == sandboxID { + return true + } + } + return false +} diff --git a/internal/db/store.go b/internal/db/store.go index c03bcd1b..6c049c39 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -835,15 +835,21 @@ func (s *Store) RecoverStaleMigrations(ctx context.Context, maxAge time.Duration return int(tag.RowsAffected()), nil } -// MarkOrphanedSandboxes marks running sandboxes on dead workers as error -// and returns the affected (sandbox_id, org_id, worker_id) tuples so the -// caller can publish `stopped` lifecycle events. liveWorkers is the set -// of worker IDs currently registered. Without the returned IDs the maintenance -// loop's PG sweep would never reach D1 sandboxes_index, which is exactly -// the post-cutover ghost-row bug. 
+// MarkOrphanedSandboxes marks non-resumable running sandboxes on dead workers +// as error and returns the affected (sandbox_id, org_id, worker_id) tuples so +// the caller can publish `stopped` lifecycle events. liveWorkers is the set of +// worker IDs currently registered. Resumable sandboxes are deliberately skipped +// here: the scaler's dead-worker recovery loop needs those rows to remain +// running long enough to recreate them on another worker. func (s *Store) MarkOrphanedSandboxes(ctx context.Context, liveWorkers map[string]bool) ([]OrphanedSandbox, error) { rows, err := s.pool.Query(ctx, - `SELECT DISTINCT worker_id FROM sandbox_sessions WHERE status = 'running'`) + `SELECT DISTINCT worker_id FROM sandbox_sessions + WHERE status = 'running' + AND NOT ( + COALESCE((config->>'resumable')::boolean, false) + OR config->>'sandboxFamily' = 'spot' + OR config->>'sandboxFamily' = 'resumable' + )`) if err != nil { return nil, err } @@ -865,6 +871,11 @@ func (s *Store) MarkOrphanedSandboxes(ctx context.Context, liveWorkers map[strin upd, err := s.pool.Query(ctx, `UPDATE sandbox_sessions SET status = 'error', error_msg = 'worker lost', stopped_at = now() WHERE worker_id = $1 AND status = 'running' + AND NOT ( + COALESCE((config->>'resumable')::boolean, false) + OR config->>'sandboxFamily' = 'spot' + OR config->>'sandboxFamily' = 'resumable' + ) RETURNING sandbox_id, org_id, worker_id`, workerID) if err != nil { continue @@ -1433,6 +1444,9 @@ func (s *Store) UpdateSandboxSessionForRecreate(ctx context.Context, sandboxID, // ReconcileWorkerSessions marks stale "running" sessions for a worker on startup. // Sessions with an active checkpoint are set to "hibernated" (recoverable via wake-on-request). // Sessions without a checkpoint are set to "stopped" (VM is gone, no recovery possible). 
+// Resumable sessions are skipped: their disk is the recovery source of truth, +// so the control plane's reverse-reconcile/dead-worker path should recreate +// them instead of converting them to hibernated/stopped on worker restart. // Returns the affected sandbox identities so the caller can emit lifecycle // events; without them D1 drifts from PG every time a worker restarts. func (s *Store) ReconcileWorkerSessions(ctx context.Context, workerID string) (hibernated, stopped []OrphanedSandbox, err error) { @@ -1446,6 +1460,11 @@ func (s *Store) ReconcileWorkerSessions(ctx context.Context, workerID string) (h hibernatedRows, err := tx.Query(ctx, `UPDATE sandbox_sessions SET status = 'hibernated' WHERE worker_id = $1 AND status = 'running' + AND NOT ( + COALESCE((config->>'resumable')::boolean, false) + OR config->>'sandboxFamily' = 'spot' + OR config->>'sandboxFamily' = 'resumable' + ) AND sandbox_id IN ( SELECT sandbox_id FROM sandbox_hibernations WHERE restored_at IS NULL AND expired_at IS NULL @@ -1475,6 +1494,11 @@ func (s *Store) ReconcileWorkerSessions(ctx context.Context, workerID string) (h `UPDATE sandbox_sessions SET status = 'stopped', stopped_at = now(), error_msg = 'worker restarted' WHERE worker_id = $1 AND status = 'running' + AND NOT ( + COALESCE((config->>'resumable')::boolean, false) + OR config->>'sandboxFamily' = 'spot' + OR config->>'sandboxFamily' = 'resumable' + ) RETURNING sandbox_id, org_id, worker_id`, workerID) if err != nil { return hibernated, nil, fmt.Errorf("failed to reconcile stopped sessions: %w", err) diff --git a/internal/qemu/golden_version.go b/internal/qemu/golden_version.go index 2fca2247..bb4e2047 100644 --- a/internal/qemu/golden_version.go +++ b/internal/qemu/golden_version.go @@ -5,6 +5,8 @@ import ( "fmt" "io" "os" + "path/filepath" + "strings" ) // computeGoldenVersion computes a version hash for a golden snapshot by hashing @@ -18,6 +20,10 @@ import ( // serve stale overlays against fresh bases, producing ext4 
directory-block // checksum errors in guests. func computeGoldenVersion(baseImagePath string) (string, error) { + if v, err := readGoldenVersionSidecar(baseImagePath); err == nil && v != "" { + return v, nil + } + f, err := os.Open(baseImagePath) if err != nil { return "", fmt.Errorf("open base image: %w", err) @@ -31,6 +37,21 @@ func computeGoldenVersion(baseImagePath string) (string, error) { return fmt.Sprintf("%x", h.Sum(nil))[:16], nil } +func readGoldenVersionSidecar(baseImagePath string) (string, error) { + b, err := os.ReadFile(filepath.Join(filepath.Dir(baseImagePath), "golden-version")) + if err != nil { + return "", err + } + v := strings.TrimSpace(string(b)) + if v == "" { + return "", nil + } + if len(v) > 16 { + v = v[:16] + } + return v, nil +} + // ComputeGoldenVersion is the exported entry point used by cmd/worker's // "golden-version" subcommand so Packer invokes the same hash function // the runtime uses for archive-key lookups. diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index acdc8d23..8033a8c9 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -2173,9 +2173,17 @@ type HibernateAllResult struct { // HibernateAll hibernates all running VMs concurrently. func (m *Manager) HibernateAll(ctx context.Context, checkpointStore *storage.CheckpointStore) []HibernateAllResult { + return m.HibernateAllExcept(ctx, checkpointStore, nil) +} + +// HibernateAllExcept hibernates all running VMs except IDs in skip. 
+func (m *Manager) HibernateAllExcept(ctx context.Context, checkpointStore *storage.CheckpointStore, skip map[string]struct{}) []HibernateAllResult { m.mu.RLock() ids := make([]string, 0, len(m.vms)) for id := range m.vms { + if _, ok := skip[id]; ok { + continue + } ids = append(ids, id) } m.mu.RUnlock() diff --git a/internal/qemu/reset.go b/internal/qemu/reset.go index b7fc72e2..10011b38 100644 --- a/internal/qemu/reset.go +++ b/internal/qemu/reset.go @@ -337,9 +337,12 @@ func (m *Manager) PowerCycleSandbox(ctx context.Context, sandboxID string) (host } // StartExistingSandbox cold-boots an existing sandbox directory on this worker. -// It is the disk-only resumable recovery primitive: no savevm/loadvm, no RAM -// preservation, just a fresh QEMU process using the existing rootfs/workspace -// drives for the same sandbox ID. +// It is the disk-only resumable recovery primitive: no savevm/loadvm and no RAM +// preservation. Only the workspace/data disk is durable; the rootfs is reset +// from this worker's clean golden image before boot. This avoids cold-booting a +// rootfs qcow2 that was last used by a VM restored from a golden memory snapshot +// or killed mid-flight, both of which can leave ext4 metadata unsafe to mount +// as a fresh boot device. 
func (m *Manager) StartExistingSandbox(ctx context.Context, sandboxID string, cfg types.SandboxConfig) (*types.Sandbox, error) { t0 := time.Now() @@ -351,12 +354,26 @@ func (m *Manager) StartExistingSandbox(ctx context.Context, sandboxID string, cf m.mu.Unlock() sandboxDir := filepath.Join(m.cfg.DataDir, "sandboxes", sandboxID) - rootfsPath := detectDrivePath(sandboxDir, "rootfs") workspacePath := detectDrivePath(sandboxDir, "workspace") - if !fileExists(rootfsPath) || !fileExists(workspacePath) { - return nil, fmt.Errorf("sandbox %s: existing drives missing on this worker (rootfs=%v, workspace=%v, dir=%s)", - sandboxID, fileExists(rootfsPath), fileExists(workspacePath), sandboxDir) + if !fileExists(workspacePath) { + return nil, fmt.Errorf("sandbox %s: existing workspace drive missing on this worker (dir=%s)", sandboxID, sandboxDir) + } + rootfsPath := filepath.Join(sandboxDir, "rootfs.qcow2") + goldenRootfs := filepath.Join(m.goldenDir, "rootfs.qcow2") + if !fileExists(goldenRootfs) { + return nil, fmt.Errorf("sandbox %s: golden rootfs missing on this worker (%s)", sandboxID, goldenRootfs) + } + tmpRootfs := filepath.Join(sandboxDir, fmt.Sprintf(".rootfs.%d.tmp", time.Now().UnixNano())) + if err := copyFileReflink(goldenRootfs, tmpRootfs); err != nil { + os.Remove(tmpRootfs) + return nil, fmt.Errorf("reset rootfs from golden: %w", err) + } + if err := os.Rename(tmpRootfs, rootfsPath); err != nil { + os.Remove(tmpRootfs) + return nil, fmt.Errorf("replace rootfs from golden: %w", err) } + log.Printf("qemu: StartExistingSandbox %s: rootfs reset from golden %s (%dms)", + sandboxID, m.goldenVersion, time.Since(t0).Milliseconds()) var meta SandboxMeta if data, err := os.ReadFile(filepath.Join(sandboxDir, "sandbox-meta.json")); err == nil { diff --git a/internal/worker/handlers.go b/internal/worker/handlers.go index 90843d4f..57067186 100644 --- a/internal/worker/handlers.go +++ b/internal/worker/handlers.go @@ -87,6 +87,12 @@ func (s *HTTPServer) adminRecreateSandbox(c 
echo.Context) error { timeout = 0 } s.router.Register(req.SandboxID, time.Duration(timeout)*time.Second) + if timeout == 0 { + // Register applies the worker default timeout for zero values. For + // disk-only resumable recreates, zero means persistent: keep the + // router entry for request routing, but remove the idle timer. + s.router.SetTimeout(req.SandboxID, 0) + } } if s.sandboxDBs != nil { if sdb, dbErr := s.sandboxDBs.Get(req.SandboxID); dbErr == nil { From 8a60e7a880da8ff92c3c340d35b7134ec4249a36 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 10:06:04 -0700 Subject: [PATCH 20/32] Add Burst worker pool support --- cmd/server/main.go | 31 +-- cmd/worker/main.go | 1 + docs/api-reference/sandboxes/create.mdx | 4 +- docs/docs.json | 4 +- docs/reference/python-sdk/sandbox.mdx | 10 +- docs/reference/typescript-sdk/sandbox.mdx | 10 +- ...able-sandboxes.mdx => burst-sandboxes.mdx} | 39 ++-- internal/api/sandbox.go | 23 ++- internal/compute/ec2.go | 184 +++++++++++++++--- internal/compute/pool.go | 6 +- internal/config/config.go | 56 ++++-- internal/controlplane/reconcile.go | 3 +- internal/controlplane/redis_registry.go | 46 ++++- internal/controlplane/scaler.go | 64 +++++- internal/controlplane/worker_registry.go | 1 + internal/db/store.go | 8 + internal/worker/grpc_server.go | 3 +- internal/worker/redis_heartbeat.go | 75 +++---- internal/worker/resumable_notice.go | 1 + pkg/types/sandbox.go | 28 ++- pkg/types/sandbox_family_test.go | 43 +++- sdks/python/opencomputer/sandbox.py | 10 +- sdks/python/pyproject.toml | 2 +- sdks/typescript/package-lock.json | 4 +- sdks/typescript/package.json | 2 +- sdks/typescript/src/sandbox.ts | 8 +- 26 files changed, 493 insertions(+), 173 deletions(-) rename docs/sandboxes/{resumable-sandboxes.mdx => burst-sandboxes.mdx} (82%) diff --git a/cmd/server/main.go b/cmd/server/main.go index 093d6a14..34867813 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -284,6 +284,7 @@ func main() { spec := 
compute.WorkerSpec{ CellID: cfg.CellID, Region: cfg.Region, + WorkerPool: cfg.WorkerPool, DatabaseURL: workerDBURL, RedisURL: workerRedisURL, JWTSecret: cfg.JWTSecret, @@ -441,17 +442,24 @@ func main() { case "aws": ec2Pool, err := compute.NewEC2Pool(compute.EC2PoolConfig{ - Region: cfg.S3Region, - AccessKeyID: cfg.S3AccessKeyID, - SecretAccessKey: cfg.S3SecretAccessKey, - AMI: cfg.EC2AMI, - InstanceType: cfg.EC2InstanceType, - SubnetID: cfg.EC2SubnetID, - SecurityGroupID: cfg.EC2SecurityGroupID, - KeyName: cfg.EC2KeyName, - IAMInstanceProfile: cfg.EC2IAMInstanceProfile, - SecretsARN: cfg.SecretsARN, - SSMParameterName: cfg.EC2SSMParameterName, + Region: cfg.Region, + AccessKeyID: cfg.EC2AccessKeyID, + SecretAccessKey: cfg.EC2SecretAccessKey, + AMI: cfg.EC2AMI, + InstanceType: cfg.EC2InstanceType, + SubnetID: cfg.EC2SubnetID, + SecurityGroupID: cfg.EC2SecurityGroupID, + KeyName: cfg.EC2KeyName, + IAMInstanceProfile: cfg.EC2IAMInstanceProfile, + SecretsARN: cfg.SecretsARN, + SSMParameterName: cfg.EC2SSMParameterName, + MarketType: cfg.EC2MarketType, + CellID: cfg.CellID, + SharedSandboxDataVolumeID: cfg.EC2SharedSandboxDataVolumeID, + SharedGoldensVolumeID: cfg.EC2SharedGoldensVolumeID, + OCFS2ClusterName: cfg.EC2OCFS2ClusterName, + OCFS2ExpectedNodes: cfg.EC2OCFS2ExpectedNodes, + OCFS2MaxNodes: cfg.EC2OCFS2MaxNodes, }) if err != nil { log.Fatalf("opensandbox: failed to create EC2 pool: %v", err) @@ -498,6 +506,7 @@ func main() { MinWorkers: cfg.MinWorkersPerRegion, MaxWorkers: cfg.MaxWorkersPerRegion, IdleReserve: cfg.IdleReserveWorkers, + WorkerPool: cfg.WorkerPool, MachineSizes: machineSizes, // For "migrated" event emit after scaler-driven migrations // (rolling replace, evacuation) — keeps D1 sandboxes_index diff --git a/cmd/worker/main.go b/cmd/worker/main.go index a25bd3cb..fb680e75 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -627,6 +627,7 @@ func main() { if err != nil { log.Printf("opensandbox-worker: Redis heartbeat not available: %v", err) 
} else { + hb.SetPool(cfg.WorkerPool) hb.SetWorkerVersion(WorkerVersion) if qemuMgr != nil { hb.SetGoldenVersion(qemuMgr.GoldenVersion()) diff --git a/docs/api-reference/sandboxes/create.mdx b/docs/api-reference/sandboxes/create.mdx index d8a105bc..906da964 100644 --- a/docs/api-reference/sandboxes/create.mdx +++ b/docs/api-reference/sandboxes/create.mdx @@ -21,8 +21,8 @@ Create a new sandbox. Memory in MB. Must match an allowed tier: `1024`, `4096`, `8192`, `16384`, `32768`, or `65536`. If omitted but `cpuCount` is set, inferred automatically.
- - Create a [resumable sandbox](/sandboxes/resumable-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. + + Create a [Burst Sandbox](/sandboxes/burst-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. The allowed CPU/memory combinations are: diff --git a/docs/docs.json b/docs/docs.json index 493c780c..365ca4de 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -47,10 +47,10 @@ "sandboxes/preview-urls", "sandboxes/elasticity", { - "group": "Resumable Sandboxes", + "group": "Burst Sandboxes", "tag": "Alpha", "pages": [ - "sandboxes/resumable-sandboxes" + "sandboxes/burst-sandboxes" ] }, { diff --git a/docs/reference/python-sdk/sandbox.mdx b/docs/reference/python-sdk/sandbox.mdx index 97347f61..7a0855da 100644 --- a/docs/reference/python-sdk/sandbox.mdx +++ b/docs/reference/python-sdk/sandbox.mdx @@ -33,8 +33,8 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) Arbitrary metadata - - Create a [resumable sandbox](/sandboxes/resumable-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. + + Create a [Burst Sandbox](/sandboxes/burst-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. @@ -55,17 +55,17 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) sandbox = await Sandbox.create(template="my-stack", timeout=600) ``` -Create a resumable sandbox: +Create a Burst Sandbox: ```python sandbox = await Sandbox.create( - resumable=True, + burst=True, timeout=300, ) ``` - Resumable sandboxes are alpha. They preserve filesystem state across + Burst Sandboxes are alpha. They preserve filesystem state across infrastructure restarts, may restart running processes, and are priced roughly 2x cheaper than on-demand sandboxes. 
diff --git a/docs/reference/typescript-sdk/sandbox.mdx b/docs/reference/typescript-sdk/sandbox.mdx index 9c351332..ef130807 100644 --- a/docs/reference/typescript-sdk/sandbox.mdx +++ b/docs/reference/typescript-sdk/sandbox.mdx @@ -33,8 +33,8 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) Arbitrary metadata - - Create a [resumable sandbox](/sandboxes/resumable-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. + + Create a [Burst Sandbox](/sandboxes/burst-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. @@ -67,17 +67,17 @@ Create a new sandbox. [HTTP API →](/api-reference/sandboxes/create) const sandbox = await Sandbox.create({ template: "my-stack", timeout: 600 }); ``` -Create a resumable sandbox: +Create a Burst Sandbox: ```typescript const sandbox = await Sandbox.create({ - resumable: true, + burst: true, timeout: 300, }); ``` - Resumable sandboxes are alpha. They preserve filesystem state across + Burst Sandboxes are alpha. They preserve filesystem state across infrastructure restarts, may restart running processes, and are priced roughly 2x cheaper than on-demand sandboxes. diff --git a/docs/sandboxes/resumable-sandboxes.mdx b/docs/sandboxes/burst-sandboxes.mdx similarity index 82% rename from docs/sandboxes/resumable-sandboxes.mdx rename to docs/sandboxes/burst-sandboxes.mdx index bd6f03cf..90bc99fc 100644 --- a/docs/sandboxes/resumable-sandboxes.mdx +++ b/docs/sandboxes/burst-sandboxes.mdx @@ -1,25 +1,25 @@ --- -title: "Resumable Sandboxes" +title: "Burst Sandboxes" description: "Alpha lower-cost sandboxes that preserve disk across infrastructure restarts" --- - Resumable sandboxes are in alpha. They preserve filesystem state across + Burst Sandboxes are in alpha. They preserve filesystem state across infrastructure restarts, but running processes, in-memory state, terminal sessions, and open network connections may restart. 
-Resumable sandboxes are lower-cost OpenComputer sandboxes designed for workloads that can recover from a process restart. They expose the same OpenComputer API for commands, files, PTY, preview URLs, images, snapshots, scaling, and lifecycle operations. +Burst Sandboxes are lower-cost OpenComputer sandboxes designed for workloads that can recover from a process restart. They expose the same OpenComputer API for commands, files, PTY, preview URLs, images, snapshots, scaling, and lifecycle operations. If OpenComputer receives advance infrastructure interruption notice, the sandbox gets up to **25 seconds** to flush state before it restarts on healthy capacity. The sandbox's filesystem is preserved and made available after resume. ## Pricing -Resumable sandboxes are priced roughly **2x cheaper than on-demand sandboxes**. +Burst Sandboxes are priced roughly **2x cheaper than on-demand sandboxes**. -## Create a Resumable Sandbox +## Create a Burst Sandbox -Set `resumable: true` when creating the sandbox. +Set `burst: true` when creating the sandbox. @@ -27,12 +27,12 @@ Set `resumable: true` when creating the sandbox. 
import { Sandbox } from "@opencomputer/sdk"; const sandbox = await Sandbox.create({ - resumable: true, + burst: true, memoryMB: 4096, timeout: 300, }); -const result = await sandbox.exec.run("echo hello from resumable"); +const result = await sandbox.exec.run("echo hello from burst"); console.log(result.stdout); await sandbox.kill(); @@ -42,11 +42,11 @@ await sandbox.kill(); from opencomputer import Sandbox async with await Sandbox.create( - resumable=True, + burst=True, memory_mb=4096, timeout=300, ) as sandbox: - result = await sandbox.exec.run("echo hello from resumable") + result = await sandbox.exec.run("echo hello from burst") print(result.stdout) ``` @@ -55,7 +55,7 @@ curl -X POST https://app.opencomputer.dev/api/sandboxes \ -H "X-API-Key: $OPENCOMPUTER_API_KEY" \ -H "Content-Type: application/json" \ -d '{ - "resumable": true, + "burst": true, "memoryMB": 4096, "timeout": 300 }' @@ -65,10 +65,11 @@ curl -X POST https://app.opencomputer.dev/api/sandboxes \ ## What Happens on Interruption -Infrastructure capacity can be reclaimed or restarted by the cloud provider. When OpenComputer receives advance notice, it notifies the sandbox and starts the resumability flow. +Infrastructure capacity can be reclaimed or restarted by the cloud provider. When OpenComputer receives advance notice, it notifies the sandbox and starts the Burst restart flow. The sandbox receives these environment variables: +- `OPENSANDBOX_BURST=true` - `OPENSANDBOX_RESUMABLE=true` - `OPENSANDBOX_RESUME_NOTICE_SECONDS=25` @@ -76,7 +77,7 @@ During the notice window, write important state to disk. After resume, restart y ## Good Fits -Resumable sandboxes work well for cost-sensitive workloads that can restart from disk: +Burst Sandboxes work well for cost-sensitive workloads that can restart from disk: - Batch code execution where failed jobs can be requeued. - CI-style checks, test runners, linters, and formatters. 
@@ -96,15 +97,15 @@ Use on-demand sandboxes when process continuity is required: ## Alpha Limitations -During alpha, resumable sandboxes have these restrictions: +During alpha, Burst Sandboxes have these restrictions: -| Capability | Resumable alpha behavior | +| Capability | Burst alpha behavior | | --- | --- | | Process state | May restart | -| Filesystem | Preserved across resumable restarts | +| Filesystem | Preserved across Burst restarts | | Notice window | Up to 25 seconds when advance notice is available | | Sudden host failure | May resume without advance notice | -| Capacity | Best effort; create may fail or wait when resumable capacity is full | +| Capacity | Best effort; create may fail or wait when Burst capacity is full | ## Reliability Pattern @@ -124,7 +125,7 @@ Example setup: import { Sandbox } from "@opencomputer/sdk"; const sandbox = await Sandbox.create({ - resumable: true, + burst: true, timeout: 0, }); @@ -173,7 +174,7 @@ await sandbox.exec.background("/home/sandbox/app/worker.sh", { ```python Python from opencomputer import Sandbox -sandbox = await Sandbox.create(resumable=True, timeout=0) +sandbox = await Sandbox.create(burst=True, timeout=0) await sandbox.exec.run("mkdir -p /home/sandbox/.opencomputer /home/sandbox/app") diff --git a/internal/api/sandbox.go b/internal/api/sandbox.go index 0537f4e8..62e3f257 100644 --- a/internal/api/sandbox.go +++ b/internal/api/sandbox.go @@ -37,7 +37,7 @@ func (s *Server) createSandbox(c echo.Context) error { "error": err.Error(), }) } - cfg = withResumableSandboxEnv(cfg) + cfg = withBurstSandboxEnv(cfg) // Validate CPU/memory against allowed tiers. // Allowed tiers (memoryMB → vCPU): 1024→1, 4096→1, 8192→2, 16384→4, 32768→8, 65536→16. @@ -186,7 +186,7 @@ func (s *Server) createSandbox(c echo.Context) error { }) } sb.SandboxFamily = cfg.SandboxFamily - sb.Resumable = cfg.IsResumable() + sb.Burst = cfg.IsResumable() // Register with sandbox router for rolling timeout tracking. 
// timeout == 0 means "persistent" (no auto-hibernate). Negative values are @@ -503,7 +503,11 @@ func (s *Server) createSandboxRemote(c echo.Context, ctx context.Context, cfg ty region = "iad" } - worker, grpcClient, err := s.workerRegistry.GetLeastLoadedWorker(region) + workerPool := controlplane.WorkerPoolOnDemand + if cfg.IsResumable() { + workerPool = controlplane.WorkerPoolBurst + } + worker, grpcClient, err := s.workerRegistry.GetLeastLoadedWorkerForPool(region, workerPool) if err != nil { // No worker immediately available — poll for up to 30s // (scaler may be launching a new worker) @@ -514,17 +518,17 @@ func (s *Server) createSandboxRemote(c echo.Context, ctx context.Context, cfg ty select { case <-deadline: return c.JSON(http.StatusServiceUnavailable, map[string]string{ - "error": "no workers available in region " + region + " (waited 30s)", + "error": fmt.Sprintf("no %s workers available in region %s (waited 30s)", workerPool, region), }) case <-ctx.Done(): return c.JSON(http.StatusServiceUnavailable, map[string]string{ "error": "request cancelled while waiting for capacity", }) case <-ticker.C: - worker, grpcClient, err = s.workerRegistry.GetLeastLoadedWorker(region) + worker, grpcClient, err = s.workerRegistry.GetLeastLoadedWorkerForPool(region, workerPool) } } - log.Printf("sandbox: worker became available after queuing (region=%s)", region) + log.Printf("sandbox: %s worker became available after queuing (region=%s)", workerPool, region) } // Resolve template (org-scoped lookup with public fallback). 
@@ -687,7 +691,7 @@ func (s *Server) createSandboxRemote(c echo.Context, ctx context.Context, cfg ty "cpuCount": cfg.CpuCount, "memoryMB": cfg.MemoryMB, "sandboxFamily": cfg.SandboxFamily, - "resumable": cfg.IsResumable(), + "burst": cfg.IsResumable(), } if s.sandboxDomain != "" { resp["sandboxDomain"] = s.sandboxDomain @@ -1213,14 +1217,15 @@ func isSpotSandboxSession(session *db.SandboxSession) bool { return sandboxSessionFamily(session) == types.SandboxFamilySpot } -func withResumableSandboxEnv(cfg types.SandboxConfig) types.SandboxConfig { +func withBurstSandboxEnv(cfg types.SandboxConfig) types.SandboxConfig { if !cfg.IsResumable() { return cfg } - envs := make(map[string]string, len(cfg.Envs)+2) + envs := make(map[string]string, len(cfg.Envs)+3) for k, v := range cfg.Envs { envs[k] = v } + envs["OPENSANDBOX_BURST"] = "true" envs["OPENSANDBOX_RESUMABLE"] = "true" envs["OPENSANDBOX_RESUME_NOTICE_SECONDS"] = "25" cfg.Envs = envs diff --git a/internal/compute/ec2.go b/internal/compute/ec2.go index 52810b85..cf17c670 100644 --- a/internal/compute/ec2.go +++ b/internal/compute/ec2.go @@ -63,6 +63,7 @@ func supportsEC2NestedVirtualization(instanceType string) bool { const ( // AWS tag keys (kept consistent with the Azure pool's azure-prefixed tags). awsTagRole = "opensandbox:role" + awsTagCell = "opensandbox:cell" awsTagInstanceType = "opensandbox:instance-type" awsTagDraining = "opensandbox:draining" awsTagWorker = "worker" @@ -70,17 +71,24 @@ const ( // EC2PoolConfig configures the EC2 compute pool. type EC2PoolConfig struct { - Region string - AccessKeyID string // empty = use default credential chain (IAM role preferred) - SecretAccessKey string - AMI string // static AMI ID; empty if SSMParameterName is set - InstanceType string // e.g. 
"c7gd.metal", "r7gd.xlarge", "m7i.large" - SubnetID string - SecurityGroupID string - KeyName string // optional SSH key pair (debug use only) - IAMInstanceProfile string // attached to instances; gives them Secrets Manager + S3 read - SecretsARN string // Secrets Manager ARN; passed to worker via WorkerSpec.SecretsRef - SSMParameterName string // SSM parameter for dynamic AMI ID (e.g. /opensandbox/dev/worker-ami-id) + Region string + AccessKeyID string // empty = use default credential chain (IAM role preferred) + SecretAccessKey string + AMI string // static AMI ID; empty if SSMParameterName is set + InstanceType string // e.g. "c7gd.metal", "r7gd.xlarge", "m7i.large" + SubnetID string + SecurityGroupID string + KeyName string // optional SSH key pair (debug use only) + IAMInstanceProfile string // attached to instances; gives them Secrets Manager + S3 read + SecretsARN string // Secrets Manager ARN; passed to worker via WorkerSpec.SecretsRef + SSMParameterName string // SSM parameter for dynamic AMI ID (e.g. /opensandbox/dev/worker-ami-id) + MarketType string // empty/on-demand or spot + CellID string + SharedSandboxDataVolumeID string // optional io2 Multi-Attach volume mounted at /data/sandboxes via OCFS2 + SharedGoldensVolumeID string // optional io2 Multi-Attach volume for golden image cache + OCFS2ClusterName string + OCFS2ExpectedNodes int + OCFS2MaxNodes int } // EC2Pool implements compute.Pool using AWS EC2 instances. 
@@ -162,6 +170,25 @@ func (p *EC2Pool) CreateMachine(ctx context.Context, opts MachineOpts) (*Machine userData := p.buildUserData(opts) machineName := fmt.Sprintf("osb-worker-%s", randomSuffix()) + instanceTags := []ec2types.Tag{ + {Key: aws.String("Name"), Value: aws.String(machineName)}, + {Key: aws.String("Role"), Value: aws.String("worker")}, + {Key: aws.String(awsTagRole), Value: aws.String(awsTagWorker)}, + {Key: aws.String(awsTagInstanceType), Value: aws.String(instanceType)}, + } + volumeTags := []ec2types.Tag{ + {Key: aws.String(awsTagRole), Value: aws.String(awsTagWorker)}, + } + if p.cfg.CellID != "" { + instanceTags = append(instanceTags, + ec2types.Tag{Key: aws.String("Cell"), Value: aws.String(p.cfg.CellID)}, + ec2types.Tag{Key: aws.String(awsTagCell), Value: aws.String(p.cfg.CellID)}, + ) + volumeTags = append(volumeTags, + ec2types.Tag{Key: aws.String("Cell"), Value: aws.String(p.cfg.CellID)}, + ec2types.Tag{Key: aws.String(awsTagCell), Value: aws.String(p.cfg.CellID)}, + ) + } input := &ec2.RunInstancesInput{ ImageId: aws.String(ami), @@ -172,20 +199,23 @@ func (p *EC2Pool) CreateMachine(ctx context.Context, opts MachineOpts) (*Machine TagSpecifications: []ec2types.TagSpecification{ { ResourceType: ec2types.ResourceTypeInstance, - Tags: []ec2types.Tag{ - {Key: aws.String("Name"), Value: aws.String(machineName)}, - {Key: aws.String(awsTagRole), Value: aws.String(awsTagWorker)}, - {Key: aws.String(awsTagInstanceType), Value: aws.String(instanceType)}, - }, + Tags: instanceTags, }, { ResourceType: ec2types.ResourceTypeVolume, - Tags: []ec2types.Tag{ - {Key: aws.String(awsTagRole), Value: aws.String(awsTagWorker)}, - }, + Tags: volumeTags, }, }, } + if strings.EqualFold(p.cfg.MarketType, "spot") { + input.InstanceMarketOptions = &ec2types.InstanceMarketOptionsRequest{ + MarketType: ec2types.MarketTypeSpot, + SpotOptions: &ec2types.SpotMarketOptions{ + InstanceInterruptionBehavior: ec2types.InstanceInterruptionBehaviorTerminate, + SpotInstanceType: 
ec2types.SpotInstanceTypeOneTime, + }, + } + } if supportsEC2NestedVirtualization(instanceType) { input.CpuOptions = &ec2types.CpuOptionsRequest{ @@ -437,6 +467,15 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { _ = opts // opts.Region/Size honored at instance launch; cloud-init is cell-uniform var sb strings.Builder sb.WriteString("#!/bin/bash\nset -euo pipefail\n\n") + sb.WriteString("systemctl stop opensandbox-worker.service 2>/dev/null || true\n") + sb.WriteString("systemctl disable opensandbox-worker.service 2>/dev/null || true\n") + sb.WriteString("systemctl reset-failed opensandbox-worker.service 2>/dev/null || true\n\n") + + sb.WriteString("# Instance identity from EC2 metadata (IMDSv2)\n") + sb.WriteString("TOKEN=$(curl -fsS -X PUT 'http://169.254.169.254/latest/api/token' -H 'X-aws-ec2-metadata-token-ttl-seconds: 300')\n") + sb.WriteString("MY_IP=$(curl -fsS -H \"X-aws-ec2-metadata-token: $TOKEN\" http://169.254.169.254/latest/meta-data/local-ipv4)\n") + sb.WriteString("INSTANCE_ID=$(curl -fsS -H \"X-aws-ec2-metadata-token: $TOKEN\" http://169.254.169.254/latest/meta-data/instance-id)\n") + sb.WriteString("WORKER_ID=\"w-aws-${INSTANCE_ID}\"\n\n") // NVMe instance store handling. Larger metal/x.gd instance families expose // multiple NVMe drives at /dev/nvme[1-N]n1; smaller instances rely on EBS @@ -464,6 +503,14 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString(" fi\n") sb.WriteString("fi\n") sb.WriteString("mkdir -p /data/sandboxes /data/firecracker/images\n") + + if p.cfg.SharedSandboxDataVolumeID != "" { + sb.WriteString(p.sharedSandboxDataUserData()) + } + if p.cfg.SharedGoldensVolumeID != "" { + sb.WriteString(p.sharedGoldensUserData()) + } + sb.WriteString("# Copy AMI-baked rootfs images to data disk if not already present\n") sb.WriteString("if [ -d /opt/opensandbox/images ] && [ ! 
-f /data/firecracker/images/default.ext4 ]; then\n") sb.WriteString(" cp /opt/opensandbox/images/*.ext4 /data/firecracker/images/ 2>/dev/null || true\n") @@ -483,10 +530,6 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString(fmt.Sprintf("echo '%s' | base64 -d > /etc/opensandbox/worker.env\n\n", envB64)) sb.WriteString("# Patch worker identity from EC2 instance metadata (IMDSv2)\n") - sb.WriteString("TOKEN=$(curl -s -X PUT 'http://169.254.169.254/latest/api/token' -H 'X-aws-ec2-metadata-token-ttl-seconds: 300')\n") - sb.WriteString("MY_IP=$(curl -s -H \"X-aws-ec2-metadata-token: $TOKEN\" http://169.254.169.254/latest/meta-data/local-ipv4)\n") - sb.WriteString("INSTANCE_ID=$(curl -s -H \"X-aws-ec2-metadata-token: $TOKEN\" http://169.254.169.254/latest/meta-data/instance-id)\n") - sb.WriteString("WORKER_ID=\"w-aws-${INSTANCE_ID}\"\n") sb.WriteString("sed -i \"s|OPENSANDBOX_GRPC_ADVERTISE=.*|OPENSANDBOX_GRPC_ADVERTISE=${MY_IP}:9090|\" /etc/opensandbox/worker.env\n") sb.WriteString("sed -i \"s|OPENSANDBOX_HTTP_ADDR=.*|OPENSANDBOX_HTTP_ADDR=http://${MY_IP}:8081|\" /etc/opensandbox/worker.env\n") sb.WriteString("sed -i \"s|OPENSANDBOX_WORKER_ID=.*|OPENSANDBOX_WORKER_ID=${WORKER_ID}|\" /etc/opensandbox/worker.env\n") @@ -501,3 +544,98 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { return sb.String() } + +func (p *EC2Pool) sharedSandboxDataUserData() string { + clusterName := p.cfg.OCFS2ClusterName + if clusterName == "" { + clusterName = "opensandbox" + } + expectedNodes := p.cfg.OCFS2ExpectedNodes + if expectedNodes <= 0 { + expectedNodes = 1 + } + maxNodes := p.cfg.OCFS2MaxNodes + if maxNodes <= 0 { + maxNodes = expectedNodes + } + if maxNodes < expectedNodes { + maxNodes = expectedNodes + } + + var sb strings.Builder + sb.WriteString("# Shared sandbox data: OCFS2 over io2 Multi-Attach\n") + sb.WriteString("if ! 
command -v mount.ocfs2 >/dev/null 2>&1; then\n") + sb.WriteString(" for i in $(seq 1 120); do\n") + sb.WriteString(" fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1 || fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || break\n") + sb.WriteString(" sleep 2\n") + sb.WriteString(" done\n") + sb.WriteString(" apt-get update\n") + sb.WriteString(" DEBIAN_FRONTEND=noninteractive apt-get install -y ocfs2-tools \"linux-modules-extra-$(uname -r)\"\n") + sb.WriteString("fi\n") + sb.WriteString(fmt.Sprintf("SANDBOX_VOLUME_ID=%q\n", p.cfg.SharedSandboxDataVolumeID)) + sb.WriteString(fmt.Sprintf("OCFS2_CLUSTER_NAME=%q\n", clusterName)) + sb.WriteString(fmt.Sprintf("OCFS2_EXPECTED_NODES=%d\n", expectedNodes)) + sb.WriteString(fmt.Sprintf("OCFS2_MAX_NODES=%d\n", maxNodes)) + sb.WriteString("aws ec2 attach-volume --region " + shellQuote(p.cfg.Region) + " --volume-id \"$SANDBOX_VOLUME_ID\" --instance-id \"$INSTANCE_ID\" --device /dev/sdg || true\n") + sb.WriteString("SANDBOX_DEV=\"\"\n") + sb.WriteString("SANDBOX_VOL_NO_DASH=\"${SANDBOX_VOLUME_ID//-/}\"\n") + sb.WriteString("for i in $(seq 1 180); do\n") + sb.WriteString(" if [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${SANDBOX_VOL_NO_DASH}\" ]; then\n") + sb.WriteString(" SANDBOX_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${SANDBOX_VOL_NO_DASH}\")\n") + sb.WriteString(" elif [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${SANDBOX_VOL_NO_DASH}_1\" ]; then\n") + sb.WriteString(" SANDBOX_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${SANDBOX_VOL_NO_DASH}_1\")\n") + sb.WriteString(" else\n") + sb.WriteString(" SANDBOX_DEV=$(lsblk -dn -o NAME,SERIAL | awk -v v=\"$SANDBOX_VOL_NO_DASH\" '$2 == v {print \"/dev/\"$1; exit}')\n") + sb.WriteString(" fi\n") + sb.WriteString(" [ -n \"${SANDBOX_DEV:-}\" ] && break\n") + sb.WriteString(" sleep 1\n") + sb.WriteString("done\n") + sb.WriteString("if [ -z \"${SANDBOX_DEV:-}\" ]; then echo 
\"ERROR: shared sandbox data volume not attached\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") + sb.WriteString("mapfile -t OCFS2_NODES < <(for i in $(seq 1 60); do aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u; break; done)\n") + sb.WriteString("for i in $(seq 1 60); do\n") + sb.WriteString(" [ \"${#OCFS2_NODES[@]}\" -ge \"$OCFS2_EXPECTED_NODES\" ] && break\n") + sb.WriteString(" sleep 2\n") + sb.WriteString(" mapfile -t OCFS2_NODES < <(aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u)\n") + sb.WriteString("done\n") + sb.WriteString("if [ \"${#OCFS2_NODES[@]}\" -lt \"$OCFS2_EXPECTED_NODES\" ]; then echo \"ERROR: found ${#OCFS2_NODES[@]} OCFS2 nodes, expected $OCFS2_EXPECTED_NODES\"; exit 1; fi\n") + sb.WriteString("install -d -m 0755 /etc/ocfs2 /etc/sysconfig\n") + sb.WriteString("{ echo \"cluster:\"; echo \" node_count = ${#OCFS2_NODES[@]}\"; echo \" name = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=0; for node in \"${OCFS2_NODES[@]}\"; do ip=$(getent ahostsv4 \"$node\" | awk '{print $1; exit}'); [ -n \"${ip:-}\" ] || { echo \"ERROR: could not resolve OCFS2 node $node\"; exit 1; }; echo \"node:\"; echo \" ip_port = 7777\"; echo \" ip_address = $ip\"; echo \" number = $n\"; echo \" name = $node\"; echo \" cluster = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=$((n + 1)); done; } > /etc/ocfs2/cluster.conf\n") + sb.WriteString("cat > 
/etc/default/o2cb </dev/null 2>&1 && o2cb register-cluster \"$OCFS2_CLUSTER_NAME\" || true\n") + sb.WriteString("[ -x /etc/init.d/o2cb ] && /etc/init.d/o2cb online \"$OCFS2_CLUSTER_NAME\" || true\n") + sb.WriteString("mkdir -p /data/sandboxes\n") + sb.WriteString("FSTYPE=$(blkid -s TYPE -o value \"$SANDBOX_DEV\" 2>/dev/null || true)\n") + sb.WriteString("if [ -z \"$FSTYPE\" ]; then mkfs.ocfs2 -F -N \"$OCFS2_MAX_NODES\" -L opensandbox-sandboxes -T vmstore \"$SANDBOX_DEV\"; fi\n") + sb.WriteString("if ! grep -q 'LABEL=opensandbox-sandboxes' /etc/fstab; then echo 'LABEL=opensandbox-sandboxes /data/sandboxes ocfs2 noauto,_netdev,noatime 0 0' >> /etc/fstab; fi\n") + sb.WriteString("timeout 90 mount -t ocfs2 -o noatime LABEL=opensandbox-sandboxes /data/sandboxes\n") + sb.WriteString("chown root:root /data/sandboxes\n\n") + return sb.String() +} + +func (p *EC2Pool) sharedGoldensUserData() string { + var sb strings.Builder + sb.WriteString("# Shared golden image volume\n") + sb.WriteString("mkdir -p /opt/opensandbox/goldens-shared /var/lib/opensandbox/golden\n") + sb.WriteString(fmt.Sprintf("GOLDENS_VOLUME_ID=%q\n", p.cfg.SharedGoldensVolumeID)) + sb.WriteString("aws ec2 attach-volume --region " + shellQuote(p.cfg.Region) + " --volume-id \"$GOLDENS_VOLUME_ID\" --instance-id \"$INSTANCE_ID\" --device /dev/sdf || true\n") + sb.WriteString("GOLDENS_DEV=\"\"\n") + sb.WriteString("GOLDENS_VOL_NO_DASH=\"${GOLDENS_VOLUME_ID//-/}\"\n") + sb.WriteString("for i in $(seq 1 120); do\n") + sb.WriteString(" if [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}\" ]; then GOLDENS_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}\"); fi\n") + sb.WriteString(" [ -n \"${GOLDENS_DEV:-}\" ] && break\n") + sb.WriteString(" sleep 1\n") + sb.WriteString("done\n") + sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then mount -o ro,noload \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true; fi\n") + sb.WriteString("if [ -d 
// shellQuote returns s wrapped in single quotes so a POSIX shell treats it as
// one literal word. An embedded single quote cannot be escaped inside single
// quotes, so each one is emitted as '"'"' (close the quote, add a
// double-quoted single quote, reopen the quote).
func shellQuote(s string) string {
	return "'" + strings.ReplaceAll(s, "'", "'\"'\"'") + "'"
}

// shellEscapedDouble escapes s so it can be spliced between double quotes in
// a generated shell script and still be read back as the literal value.
//
// Inside double quotes the shell keeps four characters special: backslash,
// double quote, dollar sign and backtick. All four are backslash-escaped
// here. Escaping only the double quote would let a value ending in a
// backslash swallow the closing quote, and would let $(...) or backticks run
// as command substitutions when the script executes.
//
// The returned string does not include the surrounding double quotes; the
// caller supplies them. Values containing none of the four characters are
// returned unchanged.
func shellEscapedDouble(s string) string {
	// A Replacer rewrites the input in a single pass, so the backslashes it
	// inserts are never themselves re-escaped.
	return strings.NewReplacer(
		`\`, `\\`,
		`"`, `\"`,
		`$`, `\$`,
		"`", "\\`",
	).Replace(s)
}
EC2 compute pool (server mode only — for auto-scaling worker machines) - EC2AMI string // Custom AMI for worker instances - EC2InstanceType string // single fallback type; used only when EC2InstanceTypes is empty - EC2InstanceTypes []string // ranked list of instance types tried in order on quota/capacity errors - EC2SubnetID string // VPC subnet for worker instances - EC2SecurityGroupID string // Security group (allow 8080, 9090, 9091) - EC2KeyName string // SSH key pair name (for debugging) - EC2WorkerImage string // Docker image for containerized workers - EC2IAMInstanceProfile string // IAM instance profile for worker instances (Secrets Manager + S3) - EC2SSMParameterName string // SSM parameter name for dynamic AMI ID (e.g. /opensandbox/prod/worker-ami-id) + EC2AMI string // Custom AMI for worker instances + EC2InstanceType string // single fallback type; used only when EC2InstanceTypes is empty + EC2InstanceTypes []string // ranked list of instance types tried in order on quota/capacity errors + EC2SubnetID string // VPC subnet for worker instances + EC2SecurityGroupID string // Security group (allow 8080, 9090, 9091) + EC2KeyName string // SSH key pair name (for debugging) + EC2WorkerImage string // Docker image for containerized workers + EC2IAMInstanceProfile string // IAM instance profile for worker instances (Secrets Manager + S3) + EC2SSMParameterName string // SSM parameter name for dynamic AMI ID (e.g. 
/opensandbox/prod/worker-ami-id) + EC2MarketType string // empty/on-demand or spot + EC2AccessKeyID string // optional EC2-specific static credentials; empty = IAM/default chain + EC2SecretAccessKey string + EC2SharedSandboxDataVolumeID string // optional io2 Multi-Attach volume mounted at /data/sandboxes + EC2SharedGoldensVolumeID string // optional io2 Multi-Attach volume mounted read-only for golden images + EC2OCFS2ClusterName string + EC2OCFS2ExpectedNodes int + EC2OCFS2MaxNodes int // Azure compute pool (server mode — for auto-scaling worker VMs) AzureSubscriptionID string // Azure subscription ID @@ -344,16 +353,25 @@ func Load() (*Config, error) { ImagesDir: os.Getenv("OPENSANDBOX_IMAGES_DIR"), // default derived from DataDir GoldenDir: os.Getenv("OPENSANDBOX_GOLDEN_DIR"), // default derived from DataDir QEMUBin: envOrDefault("OPENSANDBOX_QEMU_BIN", "qemu-system-x86_64"), - - EC2AMI: os.Getenv("OPENSANDBOX_EC2_AMI"), - EC2InstanceType: envOrDefault("OPENSANDBOX_EC2_INSTANCE_TYPE", "c7gd.metal"), - EC2InstanceTypes: splitCSV(os.Getenv("OPENSANDBOX_EC2_INSTANCE_TYPES")), - EC2SubnetID: os.Getenv("OPENSANDBOX_EC2_SUBNET_ID"), - EC2SecurityGroupID: os.Getenv("OPENSANDBOX_EC2_SECURITY_GROUP_ID"), - EC2KeyName: os.Getenv("OPENSANDBOX_EC2_KEY_NAME"), - EC2WorkerImage: envOrDefault("OPENSANDBOX_EC2_WORKER_IMAGE", "opensandbox-worker:latest"), - EC2IAMInstanceProfile: os.Getenv("OPENSANDBOX_EC2_IAM_INSTANCE_PROFILE"), - EC2SSMParameterName: os.Getenv("OPENSANDBOX_EC2_SSM_AMI_PARAM"), + WorkerPool: envOrDefault("OPENSANDBOX_WORKER_POOL", "ondemand"), + + EC2AMI: os.Getenv("OPENSANDBOX_EC2_AMI"), + EC2InstanceType: envOrDefault("OPENSANDBOX_EC2_INSTANCE_TYPE", "c7gd.metal"), + EC2InstanceTypes: splitCSV(os.Getenv("OPENSANDBOX_EC2_INSTANCE_TYPES")), + EC2SubnetID: os.Getenv("OPENSANDBOX_EC2_SUBNET_ID"), + EC2SecurityGroupID: os.Getenv("OPENSANDBOX_EC2_SECURITY_GROUP_ID"), + EC2KeyName: os.Getenv("OPENSANDBOX_EC2_KEY_NAME"), + EC2WorkerImage: 
envOrDefault("OPENSANDBOX_EC2_WORKER_IMAGE", "opensandbox-worker:latest"), + EC2IAMInstanceProfile: os.Getenv("OPENSANDBOX_EC2_IAM_INSTANCE_PROFILE"), + EC2SSMParameterName: os.Getenv("OPENSANDBOX_EC2_SSM_AMI_PARAM"), + EC2MarketType: os.Getenv("OPENSANDBOX_EC2_MARKET_TYPE"), + EC2AccessKeyID: os.Getenv("OPENSANDBOX_EC2_ACCESS_KEY_ID"), + EC2SecretAccessKey: os.Getenv("OPENSANDBOX_EC2_SECRET_ACCESS_KEY"), + EC2SharedSandboxDataVolumeID: os.Getenv("OPENSANDBOX_SHARED_SANDBOX_DATA_VOLUME_ID"), + EC2SharedGoldensVolumeID: os.Getenv("OPENSANDBOX_SHARED_GOLDENS_VOLUME_ID"), + EC2OCFS2ClusterName: envOrDefault("OPENSANDBOX_OCFS2_CLUSTER_NAME", "opensandbox"), + EC2OCFS2ExpectedNodes: envOrDefaultInt("OPENSANDBOX_OCFS2_EXPECTED_NODES", 1), + EC2OCFS2MaxNodes: envOrDefaultInt("OPENSANDBOX_OCFS2_MAX_NODES", 4), AzureSubscriptionID: os.Getenv("OPENSANDBOX_AZURE_SUBSCRIPTION_ID"), AzureResourceGroup: os.Getenv("OPENSANDBOX_AZURE_RESOURCE_GROUP"), diff --git a/internal/controlplane/reconcile.go b/internal/controlplane/reconcile.go index aae73f3d..bbf952e4 100644 --- a/internal/controlplane/reconcile.go +++ b/internal/controlplane/reconcile.go @@ -223,10 +223,11 @@ func recreateResumableSandbox(ctx context.Context, registry *RedisWorkerRegistry if cfg.Envs == nil { cfg.Envs = map[string]string{} } + cfg.Envs["OPENSANDBOX_BURST"] = "true" cfg.Envs["OPENSANDBOX_RESUMABLE"] = "true" cfg.Envs["OPENSANDBOX_RESUME_NOTICE_SECONDS"] = "25" - worker, client, err := registry.GetLeastLoadedWorker(session.Region) + worker, client, err := registry.GetLeastLoadedWorkerForPool(session.Region, WorkerPoolBurst) if err != nil { return true, fmt.Errorf("pick worker: %w", err) } diff --git a/internal/controlplane/redis_registry.go b/internal/controlplane/redis_registry.go index 6f9c5f82..d79f6b2b 100644 --- a/internal/controlplane/redis_registry.go +++ b/internal/controlplane/redis_registry.go @@ -54,6 +54,7 @@ const ( type WorkerEntry struct { ID string `json:"worker_id"` MachineID string 
`json:"machine_id,omitempty"` // EC2 instance ID + Pool string `json:"pool,omitempty"` Region string `json:"region"` GRPCAddr string `json:"grpc_addr"` HTTPAddr string `json:"http_addr"` @@ -74,6 +75,20 @@ type WorkerEntry struct { Sandboxes map[string]SandboxStats `json:"sandboxes,omitempty"` } +const ( + WorkerPoolOnDemand = "ondemand" + WorkerPoolBurst = "burst" +) + +func NormalizeWorkerPool(pool string) string { + switch pool { + case WorkerPoolBurst: + return WorkerPoolBurst + default: + return WorkerPoolOnDemand + } +} + // SandboxStats is the per-sandbox snapshot ingested from worker heartbeats. // Mirrors internal/worker.SandboxStatsWire — kept separate to avoid an // import cycle (CP shouldn't depend on the worker package). @@ -330,6 +345,7 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { existing, ok := r.workers[entry.ID] if ok { // Update existing entry + existing.Pool = NormalizeWorkerPool(entry.Pool) existing.Current = entry.Current existing.Capacity = entry.Capacity existing.CPUPct = entry.CPUPct @@ -361,9 +377,10 @@ func (r *RedisWorkerRegistry) handleHeartbeat(entry WorkerEntry) { // on that ID so it's a cheap no-op; for rejoins, this is where we // re-issue Destroy for sandboxes the cell published "stopped" for // during the unreachable window. See internal/controlplane/reconcile.go. 
+ entry.Pool = NormalizeWorkerPool(entry.Pool) entry.Draining = drainOverride r.workers[entry.ID] = &entry - log.Printf("redis_registry: new worker registered: %s (region=%s, grpc=%s, draining=%v)", entry.ID, entry.Region, entry.GRPCAddr, drainOverride) + log.Printf("redis_registry: new worker registered: %s (pool=%s, region=%s, grpc=%s, draining=%v)", entry.ID, entry.Pool, entry.Region, entry.GRPCAddr, drainOverride) if r.onWorkerRejoined != nil { // Fire in a goroutine — reconcile may take a few seconds (DB query // + a DestroySandbox RPC per stale entry) and we don't want @@ -490,18 +507,23 @@ func (r *RedisWorkerRegistry) dialWorkerLocked(workerID, grpcAddr string) { // score, since they're spikier and a transient burst shouldn't permanently // disprefer a worker. func (r *RedisWorkerRegistry) GetLeastLoadedWorker(region string) (*WorkerEntry, pb.SandboxWorkerClient, error) { + return r.GetLeastLoadedWorkerForPool(region, WorkerPoolOnDemand) +} + +func (r *RedisWorkerRegistry) GetLeastLoadedWorkerForPool(region string, pool string) (*WorkerEntry, pb.SandboxWorkerClient, error) { r.mu.Lock() defer r.mu.Unlock() + pool = NormalizeWorkerPool(pool) routingCtx, routingCancel := context.WithTimeout(context.Background(), time.Second) defer routingCancel() - eligible := r.collectEligibleLocked(region, false /* anyRegion */) + eligible := r.collectEligibleLocked(region, false /* anyRegion */, pool) if len(eligible) == 0 && region != "" { - eligible = r.collectEligibleLocked(region, true /* anyRegion */) + eligible = r.collectEligibleLocked(region, true /* anyRegion */, pool) } if len(eligible) == 0 { - return nil, nil, fmt.Errorf("no workers available") + return nil, nil, fmt.Errorf("no %s workers available", pool) } // Apply the cross-CP routing counter so an in-flight placement on the @@ -574,9 +596,12 @@ func (r *RedisWorkerRegistry) GetLeastLoadedWorker(region string) (*WorkerEntry, // collectEligibleLocked returns workers passing the routing eligibility gates. 
// If anyRegion is true the region filter is dropped (used for cross-region // fallback when a region is starved). Caller must hold r.mu. -func (r *RedisWorkerRegistry) collectEligibleLocked(region string, anyRegion bool) []*WorkerEntry { +func (r *RedisWorkerRegistry) collectEligibleLocked(region string, anyRegion bool, pool string) []*WorkerEntry { var out []*WorkerEntry for _, w := range r.workers { + if NormalizeWorkerPool(w.Pool) != pool { + continue + } if !anyRegion && region != "" && w.Region != region { continue } @@ -745,15 +770,26 @@ func (r *RedisWorkerRegistry) Regions() []string { // GetWorkersByRegion returns workers in a region (satisfies ScalerRegistry). func (r *RedisWorkerRegistry) GetWorkersByRegion(region string) []*WorkerInfo { + return r.GetWorkersByRegionAndPool(region, "") +} + +func (r *RedisWorkerRegistry) GetWorkersByRegionAndPool(region, pool string) []*WorkerInfo { r.mu.RLock() defer r.mu.RUnlock() + if pool != "" { + pool = NormalizeWorkerPool(pool) + } var result []*WorkerInfo for _, w := range r.workers { + if pool != "" && NormalizeWorkerPool(w.Pool) != pool { + continue + } if w.Region == region { result = append(result, &WorkerInfo{ ID: w.ID, MachineID: w.MachineID, + Pool: NormalizeWorkerPool(w.Pool), Region: w.Region, GRPCAddr: w.GRPCAddr, HTTPAddr: w.HTTPAddr, diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index a35dd8d4..c2e8c48f 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -90,6 +90,7 @@ type ScalerConfig struct { MinWorkers int // minimum total workers per region (0 = default 1). Always kept running. MaxWorkers int // maximum workers per region (0 = default 10). Hard cap to prevent runaway launches. IdleReserve int // target idle (0 sandbox) workers for burst absorption (0 = default 1). Separate from MinWorkers. + WorkerPool string // optional placement pool filter: ondemand or burst // Event emit for D1 sandboxes_index sync. 
After a scaler-triggered // migration succeeds (rolling replace, evacuation), XADD a "migrated" @@ -167,6 +168,7 @@ type Scaler struct { minWorkers int maxWorkers int idleReserve int + workerPool string rdb *redis.Client cellID string @@ -221,6 +223,7 @@ func NewScaler(cfg ScalerConfig) *Scaler { minWorkers: minWorkers, maxWorkers: maxWorkers, idleReserve: idleReserve, + workerPool: cfg.WorkerPool, machineSizes: cfg.MachineSizes, rdb: cfg.RedisClient, cellID: cfg.CellID, @@ -268,7 +271,7 @@ func (s *Scaler) Start() { } } }() - log.Printf("scaler: autoscaling controller started (interval=%s, cooldown=%s)", s.interval, s.cooldown) + log.Printf("scaler: autoscaling controller started (pool=%s, interval=%s, cooldown=%s)", NormalizeWorkerPool(s.workerPool), s.interval, s.cooldown) } // Stop stops the autoscaling loop. Can be called multiple times (idempotent). @@ -284,6 +287,51 @@ func (s *Scaler) Stop() { s.wg.Wait() } +func (s *Scaler) workersByRegion(region string) []*WorkerInfo { + workers := s.registry.GetWorkersByRegion(region) + if s.workerPool == "" { + return workers + } + pool := NormalizeWorkerPool(s.workerPool) + filtered := workers[:0] + for _, w := range workers { + if NormalizeWorkerPool(w.Pool) == pool { + filtered = append(filtered, w) + } + } + return filtered +} + +func (s *Scaler) regionUtilization(workers []*WorkerInfo) float64 { + if len(workers) == 0 { + return 0 + } + var totalCapacity, totalCurrent int + for _, w := range workers { + totalCapacity += w.Capacity + totalCurrent += w.Current + } + if totalCapacity == 0 { + return 0 + } + return float64(totalCurrent) / float64(totalCapacity) +} + +func (s *Scaler) regionResourcePressure(workers []*WorkerInfo) (maxCPU, maxMem, maxDisk float64) { + for _, w := range workers { + if w.CPUPct > maxCPU { + maxCPU = w.CPUPct + } + if w.MemPct > maxMem { + maxMem = w.MemPct + } + if w.DiskPct > maxDisk { + maxDisk = w.DiskPct + } + } + return maxCPU, maxMem, maxDisk +} + // EvacuateWorker starts the normal 
live-migration drain loop for a specific // worker without terminating the machine when the worker becomes empty. This is // intended for operator-triggered evacuation tests and spot-preemption drills. @@ -403,9 +451,9 @@ func (s *Scaler) evaluateRegion(ctx context.Context, region string) { s.mu.Lock() defer s.mu.Unlock() - workers := s.registry.GetWorkersByRegion(region) - utilization := s.registry.RegionUtilization(region) - maxCPU, maxMem, maxDisk := s.registry.RegionResourcePressure(region) + workers := s.workersByRegion(region) + utilization := s.regionUtilization(workers) + maxCPU, maxMem, maxDisk := s.regionResourcePressure(workers) // Expire stale pending launches s.expirePending(region) @@ -660,7 +708,7 @@ func (s *Scaler) expirePending(region string) { // Get currently registered worker machine IDs registered := make(map[string]bool) - for _, w := range s.registry.GetWorkersByRegion(region) { + for _, w := range s.workersByRegion(region) { if w.MachineID != "" { registered[w.MachineID] = true } @@ -852,7 +900,7 @@ func (s *Scaler) evacuateHotWorkers(_ context.Context, region string, workers [] // need 2x the workers of one at 50% actual — expensive dead weight for an // idle-heavy workload like sandboxes. func (s *Scaler) findMigrationTarget(region, excludeWorkerID string, requiredMemMB int32) *WorkerInfo { - workers := s.registry.GetWorkersByRegion(region) + workers := s.workersByRegion(region) var best *WorkerInfo bestScore := -1.0 @@ -1306,7 +1354,7 @@ func (s *Scaler) drainWorker(workerID, machineID, region, reason string) { // utilization-based scale-up path in Evaluate() won't trigger. 
if probe := s.findMigrationTarget(region, workerID, 0); probe == nil { effective := 0 - for _, w := range s.registry.GetWorkersByRegion(region) { + for _, w := range s.workersByRegion(region) { if !s.state.IsDraining(w.MachineID) { effective++ } @@ -1434,7 +1482,7 @@ func (s *Scaler) checkDrainingWorkers(ctx context.Context, region string) { } // Check if worker has 0 sandboxes - workers := s.registry.GetWorkersByRegion(region) + workers := s.workersByRegion(region) for _, w := range workers { if w.MachineID == machineID && w.Current == 0 { log.Printf("scaler: worker %s fully drained (0 sandboxes), destroying machine %s", diff --git a/internal/controlplane/worker_registry.go b/internal/controlplane/worker_registry.go index bb544894..495ccfb1 100644 --- a/internal/controlplane/worker_registry.go +++ b/internal/controlplane/worker_registry.go @@ -15,6 +15,7 @@ import ( type WorkerInfo struct { ID string `json:"worker_id"` MachineID string `json:"machine_id,omitempty"` // EC2 instance ID + Pool string `json:"pool,omitempty"` Region string `json:"region"` GRPCAddr string `json:"grpc_addr"` HTTPAddr string `json:"http_addr"` diff --git a/internal/db/store.go b/internal/db/store.go index 6c049c39..a1862bad 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -846,6 +846,8 @@ func (s *Store) MarkOrphanedSandboxes(ctx context.Context, liveWorkers map[strin `SELECT DISTINCT worker_id FROM sandbox_sessions WHERE status = 'running' AND NOT ( + COALESCE((config->>'burst')::boolean, false) + OR COALESCE((config->>'resumable')::boolean, false) OR config->>'sandboxFamily' = 'spot' OR config->>'sandboxFamily' = 'resumable' @@ -872,6 +874,8 @@ func (s *Store) MarkOrphanedSandboxes(ctx context.Context, liveWorkers map[strin `UPDATE sandbox_sessions SET status = 'error', error_msg = 'worker lost', stopped_at = now() WHERE worker_id = $1 AND status = 'running' AND NOT ( + COALESCE((config->>'burst')::boolean, false) + OR COALESCE((config->>'resumable')::boolean, false) OR 
config->>'sandboxFamily' = 'spot' OR config->>'sandboxFamily' = 'resumable' @@ -1461,6 +1465,8 @@ func (s *Store) ReconcileWorkerSessions(ctx context.Context, workerID string) (h `UPDATE sandbox_sessions SET status = 'hibernated' WHERE worker_id = $1 AND status = 'running' AND NOT ( + COALESCE((config->>'burst')::boolean, false) + OR COALESCE((config->>'resumable')::boolean, false) OR config->>'sandboxFamily' = 'spot' OR config->>'sandboxFamily' = 'resumable' @@ -1495,6 +1501,8 @@ func (s *Store) ReconcileWorkerSessions(ctx context.Context, workerID string) (h error_msg = 'worker restarted' WHERE worker_id = $1 AND status = 'running' AND NOT ( + COALESCE((config->>'burst')::boolean, false) + OR COALESCE((config->>'resumable')::boolean, false) OR config->>'sandboxFamily' = 'spot' OR config->>'sandboxFamily' = 'resumable' diff --git a/internal/worker/grpc_server.go b/internal/worker/grpc_server.go index 4a7a7b0e..d6e4840a 100644 --- a/internal/worker/grpc_server.go +++ b/internal/worker/grpc_server.go @@ -220,7 +220,8 @@ func (s *GRPCServer) CreateSandbox(ctx context.Context, req *pb.CreateSandboxReq SecretEnvs: req.SecretEnvs, DiskMB: int(req.DiskMb), } - if cfg.Envs["OPENSANDBOX_RESUMABLE"] == "true" { + if cfg.Envs["OPENSANDBOX_BURST"] == "true" || cfg.Envs["OPENSANDBOX_RESUMABLE"] == "true" { + cfg.Burst = true cfg.Resumable = true cfg.SandboxFamily = types.SandboxFamilySpot } diff --git a/internal/worker/redis_heartbeat.go b/internal/worker/redis_heartbeat.go index 82c22d60..0f9854a3 100644 --- a/internal/worker/redis_heartbeat.go +++ b/internal/worker/redis_heartbeat.go @@ -13,15 +13,16 @@ import ( // redisHeartbeatPayload is the JSON structure published to Redis. type redisHeartbeatPayload struct { - WorkerID string `json:"worker_id"` - MachineID string `json:"machine_id,omitempty"` // EC2 instance ID (e.g. 
i-099088f8ac4a34ef3) - Region string `json:"region"` - GRPCAddr string `json:"grpc_addr"` - HTTPAddr string `json:"http_addr"` - Capacity int `json:"capacity"` - Current int `json:"current"` - CPUPct float64 `json:"cpu_pct"` - MemPct float64 `json:"mem_pct"` + WorkerID string `json:"worker_id"` + MachineID string `json:"machine_id,omitempty"` // EC2 instance ID (e.g. i-099088f8ac4a34ef3) + Pool string `json:"pool,omitempty"` + Region string `json:"region"` + GRPCAddr string `json:"grpc_addr"` + HTTPAddr string `json:"http_addr"` + Capacity int `json:"capacity"` + Current int `json:"current"` + CPUPct float64 `json:"cpu_pct"` + MemPct float64 `json:"mem_pct"` DiskPct float64 `json:"disk_pct"` TotalMemoryMB int `json:"total_memory_mb,omitempty"` CommittedMemoryMB int `json:"committed_memory_mb,omitempty"` @@ -52,21 +53,22 @@ type SandboxStatsWire struct { // 1. SETs worker:{id} with a 30s TTL (auto-expires if worker dies) // 2. PUBLISHes to workers:heartbeat for real-time server notification type RedisHeartbeat struct { - rdb *redis.Client - workerID string - machineID string - region string - grpcAddr string - httpAddr string - getStats func() (capacity, current int, cpuPct, memPct, diskPct float64) - getMemoryInfo func() (totalMB, committedMB int) // optional: committed memory for dynamic capacity - getSandboxStats func() map[string]SandboxStatsWire // optional: per-sandbox stats for autoscaler - onReconnect func() // called when heartbeat succeeds after a previous failure - goldenVersion string - workerVersion string - wasDown bool // true if the last publish failed (used to detect reconnect) - stop chan struct{} - stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer + rdb *redis.Client + workerID string + machineID string + pool string + region string + grpcAddr string + httpAddr string + getStats func() (capacity, current int, cpuPct, memPct, diskPct float64) + getMemoryInfo func() (totalMB, committedMB int) 
// optional: committed memory for dynamic capacity + getSandboxStats func() map[string]SandboxStatsWire // optional: per-sandbox stats for autoscaler + onReconnect func() // called when heartbeat succeeds after a previous failure + goldenVersion string + workerVersion string + wasDown bool // true if the last publish failed (used to detect reconnect) + stop chan struct{} + stopOnce sync.Once // guards close(stop) + rdb.Del — Stop() may be called from preemption handler and defer } // NewRedisHeartbeat creates a new heartbeat publisher. @@ -101,6 +103,12 @@ func NewRedisHeartbeat(redisURL, workerID, region, grpcAddr, httpAddr string) (* }, nil } +// SetPool sets the placement pool advertised by this worker. Empty defaults +// to ondemand on the control plane for backward compatibility. +func (h *RedisHeartbeat) SetPool(pool string) { + h.pool = pool +} + // SetMachineID sets the EC2 instance ID for the heartbeat (used by scaler for drain/terminate). func (h *RedisHeartbeat) SetMachineID(id string) { h.machineID = id @@ -161,15 +169,16 @@ func (h *RedisHeartbeat) publish() { capacity, current, cpuPct, memPct, diskPct := h.getStats() payload := redisHeartbeatPayload{ - WorkerID: h.workerID, - MachineID: h.machineID, - Region: h.region, - GRPCAddr: h.grpcAddr, - HTTPAddr: h.httpAddr, - Capacity: capacity, - Current: current, - CPUPct: cpuPct, - MemPct: memPct, + WorkerID: h.workerID, + MachineID: h.machineID, + Pool: h.pool, + Region: h.region, + GRPCAddr: h.grpcAddr, + HTTPAddr: h.httpAddr, + Capacity: capacity, + Current: current, + CPUPct: cpuPct, + MemPct: memPct, DiskPct: diskPct, GoldenVersion: h.goldenVersion, WorkerVersion: h.workerVersion, diff --git a/internal/worker/resumable_notice.go b/internal/worker/resumable_notice.go index 745edff9..05d98943 100644 --- a/internal/worker/resumable_notice.go +++ b/internal/worker/resumable_notice.go @@ -100,6 +100,7 @@ sync` Command: "/bin/sh", Args: []string{"-lc", restartNoticeScript}, Env: map[string]string{ + 
"OPENSANDBOX_BURST": "true", "OPENSANDBOX_RESUMABLE": "true", "OPENSANDBOX_RESUME_NOTICE_SECONDS": fmt.Sprintf("%d", noticeSeconds), }, diff --git a/pkg/types/sandbox.go b/pkg/types/sandbox.go index 8988ecb9..fcfb2908 100644 --- a/pkg/types/sandbox.go +++ b/pkg/types/sandbox.go @@ -34,7 +34,7 @@ type Sandbox struct { CpuCount int `json:"cpuCount"` MemoryMB int `json:"memoryMB"` SandboxFamily string `json:"sandboxFamily,omitempty"` - Resumable bool `json:"resumable,omitempty"` + Burst bool `json:"burst,omitempty"` MachineID string `json:"machineID,omitempty"` // ConnectURL and Token are currently unused by SDKs. All data-plane traffic // flows through the control plane's SandboxAPIProxy, which proxies to workers @@ -54,12 +54,16 @@ type SandboxConfig struct { MemoryMB int `json:"memoryMB,omitempty"` // default 256 DiskMB int `json:"diskMB,omitempty"` // workspace disk in MB (default 20480) // SandboxFamily selects an internal placement family. Empty is the default - // on-demand family. "spot" routes through resumable spare-capacity workers. + // on-demand family. "spot" routes through burst spare-capacity workers. SandboxFamily string `json:"sandboxFamily,omitempty"` - // Resumable selects the alpha lower-cost resumable sandbox tier. Resumable - // sandboxes preserve disk across infrastructure restarts but do not - // guarantee running process or memory survival. Internally this maps to the - // spot placement family while that tier is backed by spare cloud capacity. + // Burst selects the alpha lower-cost Burst Sandbox tier. Burst Sandboxes + // preserve disk across infrastructure restarts but do not guarantee running + // process or memory survival. Internally this maps to the spot placement + // family while that tier is backed by spare cloud capacity. + Burst bool `json:"burst,omitempty"` + // Resumable is the legacy public field name for Burst Sandboxes. Keep + // accepting it for existing callers and rows, but prefer Burst for new API + // requests/responses. 
Resumable bool `json:"resumable,omitempty"` Envs map[string]string `json:"envs,omitempty"` Port int `json:"port,omitempty"` // container port to expose via subdomain (default 80) @@ -206,7 +210,7 @@ func ValidateResourceTier(cfg *SandboxConfig) error { // ApplySandboxFamilyDefaultsAndValidate normalizes alpha sandbox-family options // before regular resource-tier validation. func ApplySandboxFamilyDefaultsAndValidate(cfg *SandboxConfig) error { - if cfg.Resumable { + if cfg.Burst || cfg.Resumable { if cfg.SandboxFamily == "" || cfg.SandboxFamily == "default" { cfg.SandboxFamily = SandboxFamilySpot } @@ -217,12 +221,16 @@ func ApplySandboxFamilyDefaultsAndValidate(cfg *SandboxConfig) error { case "default": cfg.SandboxFamily = SandboxFamilyDefault return nil + case "burst": + cfg.SandboxFamily = SandboxFamilySpot + cfg.Burst = true + return ApplySandboxFamilyDefaultsAndValidate(cfg) case "resumable": cfg.SandboxFamily = SandboxFamilySpot - cfg.Resumable = true + cfg.Burst = true return ApplySandboxFamilyDefaultsAndValidate(cfg) case SandboxFamilySpot: - cfg.Resumable = true + cfg.Burst = true return nil default: return fmt.Errorf("unsupported sandboxFamily %q", cfg.SandboxFamily) @@ -234,7 +242,7 @@ func (c SandboxConfig) IsSpotFamily() bool { } func (c SandboxConfig) IsResumable() bool { - return c.Resumable || c.SandboxFamily == SandboxFamilySpot + return c.Burst || c.Resumable || c.SandboxFamily == SandboxFamilySpot } // SandboxListResponse is the response for listing sandboxes. 
diff --git a/pkg/types/sandbox_family_test.go b/pkg/types/sandbox_family_test.go index 64103f03..ff87870d 100644 --- a/pkg/types/sandbox_family_test.go +++ b/pkg/types/sandbox_family_test.go @@ -5,18 +5,32 @@ import ( "testing" ) -func TestApplySandboxFamilyDefaultsAndValidateSpotMarksResumable(t *testing.T) { +func TestApplySandboxFamilyDefaultsAndValidateSpotMarksBurst(t *testing.T) { cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot} if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { t.Fatalf("unexpected error: %v", err) } - if !cfg.Resumable { - t.Fatalf("expected internal spot family to mark sandbox resumable") + if !cfg.Burst { + t.Fatalf("expected internal spot family to mark sandbox burst") } } -func TestApplySandboxFamilyDefaultsAndValidateResumableFlagMapsToSpot(t *testing.T) { +func TestApplySandboxFamilyDefaultsAndValidateBurstFlagMapsToSpot(t *testing.T) { + cfg := SandboxConfig{Burst: true} + + if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg.SandboxFamily != SandboxFamilySpot { + t.Fatalf("expected burst to map to internal spot family, got %q", cfg.SandboxFamily) + } + if cfg.CpuCount != 0 || cfg.MemoryMB != 0 { + t.Fatalf("expected burst not to force resources, got cpu=%d memory=%d", cfg.CpuCount, cfg.MemoryMB) + } +} + +func TestApplySandboxFamilyDefaultsAndValidateLegacyResumableFlagMapsToSpot(t *testing.T) { cfg := SandboxConfig{Resumable: true} if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { @@ -30,25 +44,36 @@ func TestApplySandboxFamilyDefaultsAndValidateResumableFlagMapsToSpot(t *testing } } -func TestApplySandboxFamilyDefaultsAndValidateResumableFamilyAlias(t *testing.T) { +func TestApplySandboxFamilyDefaultsAndValidateBurstFamilyAlias(t *testing.T) { + cfg := SandboxConfig{SandboxFamily: "burst"} + + if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if 
cfg.SandboxFamily != SandboxFamilySpot || !cfg.Burst { + t.Fatalf("expected burst alias to map to internal spot family, got family=%q burst=%v", cfg.SandboxFamily, cfg.Burst) + } +} + +func TestApplySandboxFamilyDefaultsAndValidateLegacyResumableFamilyAlias(t *testing.T) { cfg := SandboxConfig{SandboxFamily: "resumable"} if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { t.Fatalf("unexpected error: %v", err) } - if cfg.SandboxFamily != SandboxFamilySpot || !cfg.Resumable { - t.Fatalf("expected resumable alias to map to internal spot family, got family=%q resumable=%v", cfg.SandboxFamily, cfg.Resumable) + if cfg.SandboxFamily != SandboxFamilySpot || !cfg.Burst { + t.Fatalf("expected legacy resumable alias to map to internal spot family, got family=%q burst=%v", cfg.SandboxFamily, cfg.Burst) } } -func TestApplySandboxFamilyDefaultsAndValidateResumableAllowsLargerTier(t *testing.T) { +func TestApplySandboxFamilyDefaultsAndValidateBurstAllowsLargerTier(t *testing.T) { cfg := SandboxConfig{SandboxFamily: SandboxFamilySpot, CpuCount: 1, MemoryMB: 4096} if err := ApplySandboxFamilyDefaultsAndValidate(&cfg); err != nil { t.Fatalf("unexpected error: %v", err) } if err := ValidateResourceTier(&cfg); err != nil { - t.Fatalf("expected larger resumable tier to validate normally: %v", err) + t.Fatalf("expected larger burst tier to validate normally: %v", err) } } diff --git a/sdks/python/opencomputer/sandbox.py b/sdks/python/opencomputer/sandbox.py index 4f7f35de..f4b1485e 100644 --- a/sdks/python/opencomputer/sandbox.py +++ b/sdks/python/opencomputer/sandbox.py @@ -85,6 +85,7 @@ async def create( api_url: str | None = None, envs: dict[str, str] | None = None, metadata: dict[str, str] | None = None, + burst: bool | None = None, resumable: bool | None = None, sandbox_family: str | None = None, disk_mb: int | None = None, @@ -103,10 +104,11 @@ async def create( api_url: API URL (or OPENCOMPUTER_API_URL env var). envs: Environment variables to inject. 
Overrides store secrets. metadata: Custom metadata key-value pairs. - resumable: Create a resumable sandbox. Disk is preserved across + burst: Create a Burst Sandbox. Disk is preserved across infrastructure restarts; processes may restart. + resumable: Deprecated alias for ``burst``. sandbox_family: Internal/legacy placement family. Prefer - ``resumable=True`` for public API usage. + ``burst=True`` for public API usage. disk_mb: Workspace disk size in MB (default 20480 = 20GB). Any additional GB above 20GB is metered at a per-second rate comparable to EBS gp3. Closed beta: requests above 20GB @@ -150,7 +152,9 @@ async def create( body["envs"] = envs if metadata: body["metadata"] = metadata - if resumable is not None: + if burst is not None: + body["burst"] = burst + elif resumable is not None: body["resumable"] = resumable if sandbox_family: body["sandboxFamily"] = sandbox_family diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index d2498465..f77cd463 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "opencomputer-sdk" -version = "0.6.0" +version = "0.6.1" description = "Python SDK for OpenComputer - cloud sandbox platform" readme = "README.md" requires-python = ">=3.10" diff --git a/sdks/typescript/package-lock.json b/sdks/typescript/package-lock.json index 6cf6c844..e54228f2 100644 --- a/sdks/typescript/package-lock.json +++ b/sdks/typescript/package-lock.json @@ -1,12 +1,12 @@ { "name": "@opencomputer/sdk", - "version": "0.6.0", + "version": "0.6.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@opencomputer/sdk", - "version": "0.6.0", + "version": "0.6.1", "license": "MIT", "devDependencies": { "@types/node": "^25.3.0", diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index 9f55155b..2caa0f8f 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -1,6 +1,6 @@ { "name": 
"@opencomputer/sdk", - "version": "0.6.0", + "version": "0.6.1", "description": "TypeScript SDK for OpenComputer - cloud sandbox platform", "type": "module", "main": "dist/index.js", diff --git a/sdks/typescript/src/sandbox.ts b/sdks/typescript/src/sandbox.ts index c627fb7b..c6918e55 100644 --- a/sdks/typescript/src/sandbox.ts +++ b/sdks/typescript/src/sandbox.ts @@ -13,9 +13,11 @@ function resolveApiUrl(url: string): string { export interface SandboxOpts { template?: string; - /** Create a resumable sandbox. Disk is preserved across infrastructure restarts; processes may restart. */ + /** Create a Burst Sandbox. Disk is preserved across infrastructure restarts; processes may restart. */ + burst?: boolean; + /** @deprecated Use `burst` instead. */ resumable?: boolean; - /** Internal/legacy placement family. Prefer `resumable: true` for public API usage. */ + /** Internal/legacy placement family. Prefer `burst: true` for public API usage. */ sandboxFamily?: "spot"; /** * Idle timeout in seconds after which the sandbox auto-hibernates. 
@@ -51,6 +53,7 @@ interface SandboxData { status: string; templateID?: string; sandboxFamily?: string; + burst?: boolean; resumable?: boolean; connectURL?: string; token?: string; @@ -266,6 +269,7 @@ export class Sandbox { }; if (opts.envs) body.envs = opts.envs; if (opts.metadata) body.metadata = opts.metadata; + if (opts.burst != null) body.burst = opts.burst; if (opts.resumable != null) body.resumable = opts.resumable; if (opts.sandboxFamily) body.sandboxFamily = opts.sandboxFamily; if (opts.cpuCount != null) body.cpuCount = opts.cpuCount; From 0310f6e152b3f3ab7bb4de18c99de4d047c702a5 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 11:04:57 -0700 Subject: [PATCH 21/32] Use EC2 AMI for AWS scaler launches --- cmd/server/main.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmd/server/main.go b/cmd/server/main.go index 34867813..8437a9a3 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -485,11 +485,14 @@ func main() { // Pick the per-provider ranked size list. Empty → scaler defers to // the pool's single configured default (cfg.AzureVMSize / cfg.EC2InstanceType). 
var machineSizes []string + var workerImage string switch { case len(cfg.AzureVMSizes) > 0 && cfg.AzureSubscriptionID != "": machineSizes = cfg.AzureVMSizes + workerImage = cfg.AzureImageID case len(cfg.EC2InstanceTypes) > 0 && (cfg.EC2AMI != "" || cfg.EC2SSMParameterName != ""): machineSizes = cfg.EC2InstanceTypes + workerImage = cfg.EC2AMI } if len(machineSizes) > 0 { log.Printf("opensandbox: scaler size fallback ranked: %v", machineSizes) @@ -501,7 +504,7 @@ func main() { Registry: redisRegistry, Store: opts.Store, StateStore: scalerState, - WorkerImage: cfg.EC2WorkerImage, + WorkerImage: workerImage, Cooldown: time.Duration(cfg.ScaleCooldownSec) * time.Second, MinWorkers: cfg.MinWorkersPerRegion, MaxWorkers: cfg.MaxWorkersPerRegion, From 6c9c4eb861772d226a7ee025d2b5f1032cfd6ae5 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 14:36:34 -0700 Subject: [PATCH 22/32] docs: simplify sandbox sizing references --- docs/api-reference/sandboxes/create.mdx | 17 +++-------------- docs/reference/cli/scaling.mdx | 6 +++--- docs/reference/python-sdk/scaling.mdx | 8 ++++---- docs/reference/typescript-sdk.mdx | 4 ++-- docs/reference/typescript-sdk/scaling.mdx | 8 ++++---- docs/sandboxes/templates.mdx | 2 +- 6 files changed, 17 insertions(+), 28 deletions(-) diff --git a/docs/api-reference/sandboxes/create.mdx b/docs/api-reference/sandboxes/create.mdx index 906da964..57ee47f3 100644 --- a/docs/api-reference/sandboxes/create.mdx +++ b/docs/api-reference/sandboxes/create.mdx @@ -14,31 +14,20 @@ Create a new sandbox. - CPU cores. Must match an allowed tier: `1`, `2`, `4`, `8`, or `16`. If omitted but `memoryMB` is set, inferred automatically. + CPU cores. If omitted but `memoryMB` is set, inferred automatically. - Memory in MB. Must match an allowed tier: `1024`, `4096`, `8192`, `16384`, `32768`, or `65536`. If omitted but `cpuCount` is set, inferred automatically. + Memory in MB. If omitted but `cpuCount` is set, inferred automatically. 
Create a [Burst Sandbox](/sandboxes/burst-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. -The allowed CPU/memory combinations are: - -| Memory | vCPU | -| --- | --- | -| 1024 MB (1 GB) | 1 | -| 4096 MB (4 GB) | 1 | -| 8192 MB (8 GB) | 2 | -| 16384 MB (16 GB) | 4 | -| 32768 MB (32 GB) | 8 | -| 65536 MB (64 GB) | 16 | - The 1 GB tier provides 1 vCPU on a best-effort basis. For guaranteed CPU allocation, use the 4 GB tier or above. -If both `cpuCount` and `memoryMB` are provided, they must match one of these tiers. +If both `cpuCount` and `memoryMB` are provided, they must match a platform tier. Environment variables as key-value pairs diff --git a/docs/reference/cli/scaling.mdx b/docs/reference/cli/scaling.mdx index 4a6b200d..62f42815 100644 --- a/docs/reference/cli/scaling.mdx +++ b/docs/reference/cli/scaling.mdx @@ -11,7 +11,7 @@ The CLI groups three related actions for sizing a sandbox: | [`oc sandbox scale`](#oc-sandbox-scale-id-memory-mb) | Manually resize once | | [`oc sandbox lock`](#oc-sandbox-lock-id) / [`unlock`](#oc-sandbox-unlock-id) / [`lock-status`](#oc-sandbox-lock-status-id) | Freeze or unfreeze the current size | -Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CPU follows memory per the platform's tier table (e.g. 8 GB → 4 vCPU); you don't pick CPU separately. +CPU follows memory per the platform's tier table; you don't pick CPU separately. For the underlying concepts and how these three modes interact, see the [Elasticity](/sandboxes/elasticity) guide. @@ -53,8 +53,8 @@ The asymmetry is deliberate: rapid response when the user notices lag, conservat | --- | --- | | `--on` | Enable autoscale (requires `--min` and `--max`). | | `--off` | Disable autoscale. Mutually exclusive with `--on`. | -| `--min N` | Minimum memory tier in MB. Must be an allowed tier. | -| `--max N` | Maximum memory tier in MB. Must be an allowed tier and ≥ `--min`. | +| `--min N` | Minimum memory in MB. 
| +| `--max N` | Maximum memory in MB. Must be ≥ `--min`. | **Errors** diff --git a/docs/reference/python-sdk/scaling.mdx b/docs/reference/python-sdk/scaling.mdx index 214f16c9..b8138a6f 100644 --- a/docs/reference/python-sdk/scaling.mdx +++ b/docs/reference/python-sdk/scaling.mdx @@ -11,7 +11,7 @@ OpenComputer sandboxes can change size at runtime. There are three knobs: | Track memory pressure automatically | [`set_autoscale()`](#sandbox-set-autoscale) | | Freeze the current size | [`set_scaling_lock()`](#sandbox-set-scaling-lock-locked) | -Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CPU follows memory per the platform's tier table (e.g. 8 GB → 4 vCPU). You don't pick CPU separately. +CPU follows memory per the platform's tier table. You don't pick CPU separately. ## How the three interact @@ -26,7 +26,7 @@ Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CP Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) - Target memory tier in MB. Must be one of the allowed tiers. + Target memory in MB. **Returns:** `dict` with `sandboxID`, `memoryMB`, `cpuPercent`. @@ -61,11 +61,11 @@ Enable or disable per-sandbox autoscale. - Lower bound when `enabled=True`. Must be an allowed tier. + Lower bound when `enabled=True`. - Upper bound when `enabled=True`. Must be an allowed tier and ≥ `min_memory_mb`. + Upper bound when `enabled=True`. Must be ≥ `min_memory_mb`. **Returns:** `dict` with `sandboxID`, `enabled`, `minMemoryMB`, `maxMemoryMB`. diff --git a/docs/reference/typescript-sdk.mdx b/docs/reference/typescript-sdk.mdx index 754fc8df..e5c2584a 100644 --- a/docs/reference/typescript-sdk.mdx +++ b/docs/reference/typescript-sdk.mdx @@ -33,8 +33,8 @@ Create a new sandbox. 
| `apiUrl` | string | env var | API URL | | `envs` | Record\ | — | Environment variables | | `metadata` | Record\ | — | Arbitrary metadata | -| `cpuCount` | number | — | CPU cores (1, 2, or 4) | -| `memoryMB` | number | — | Memory in MB (1024, 4096, 8192, 16384, 32768, or 65536) | +| `cpuCount` | number | — | CPU cores | +| `memoryMB` | number | — | Memory in MB | | `image` | Image | — | Declarative image definition (see [Image](#image)) | | `snapshot` | string | — | Name of a pre-built snapshot | | `onBuildLog` | `(log: string) => void` | — | Build log callback (when using `image`) | diff --git a/docs/reference/typescript-sdk/scaling.mdx b/docs/reference/typescript-sdk/scaling.mdx index 4ca44ae5..0b01de58 100644 --- a/docs/reference/typescript-sdk/scaling.mdx +++ b/docs/reference/typescript-sdk/scaling.mdx @@ -11,7 +11,7 @@ OpenComputer sandboxes can change size at runtime. There are three knobs: | Track memory pressure automatically | [`setAutoscale()`](#sandbox-setautoscale-opts) | | Freeze the current size | [`setScalingLock()`](#sandbox-setscalinglock-locked) | -Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CPU follows memory per the platform's tier table (e.g. 8 GB → 4 vCPU). You don't pick CPU separately. +CPU follows memory per the platform's tier table. You don't pick CPU separately. ## How the three interact @@ -26,7 +26,7 @@ Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CP Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) - Target memory tier in MB. Must be one of the allowed tiers. + Target memory in MB. **Returns:** `Promise<{ sandboxID: string; memoryMB: number; cpuPercent: number }>` @@ -66,11 +66,11 @@ Enable or disable per-sandbox autoscale. - Lower bound when `enabled=true`. Must be an allowed tier. + Lower bound when `enabled=true`. - Upper bound when `enabled=true`. Must be an allowed tier and ≥ `minMemoryMB`. + Upper bound when `enabled=true`. 
Must be ≥ `minMemoryMB`. **Returns:** `Promise<{ sandboxID: string; enabled: boolean; minMemoryMB: number; maxMemoryMB: number }>` diff --git a/docs/sandboxes/templates.mdx b/docs/sandboxes/templates.mdx index ce17a574..67fea5f7 100644 --- a/docs/sandboxes/templates.mdx +++ b/docs/sandboxes/templates.mdx @@ -195,7 +195,7 @@ curl -X POST https://app.opencomputer.dev/api/sandboxes \ `memoryMB` is clamped to a valid range, and the response's `memoryMB` reports the effective value: - **Floor — the snapshot's own memory.** A smaller request is ignored; a fork can't start smaller than the snapshot it restores. A 4 GB snapshot forked with `memoryMB: 1024` still boots at ~4 GB. -- **Ceiling — 16 GB.** Larger requests are capped (`32768` → `16384`). +- **Ceiling.** Larger requests are capped to the maximum platform tier. The same `memoryMB` field works when [forking from a checkpoint](/api-reference/checkpoints/fork). From fe282a6806365bdeb7d5045ed59a5a7e5351e200 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 14:41:26 -0700 Subject: [PATCH 23/32] Revert "docs: simplify sandbox sizing references" This reverts commit 6c9c4eb861772d226a7ee025d2b5f1032cfd6ae5. --- docs/api-reference/sandboxes/create.mdx | 17 ++++++++++++++--- docs/reference/cli/scaling.mdx | 6 +++--- docs/reference/python-sdk/scaling.mdx | 8 ++++---- docs/reference/typescript-sdk.mdx | 4 ++-- docs/reference/typescript-sdk/scaling.mdx | 8 ++++---- docs/sandboxes/templates.mdx | 2 +- 6 files changed, 28 insertions(+), 17 deletions(-) diff --git a/docs/api-reference/sandboxes/create.mdx b/docs/api-reference/sandboxes/create.mdx index 57ee47f3..906da964 100644 --- a/docs/api-reference/sandboxes/create.mdx +++ b/docs/api-reference/sandboxes/create.mdx @@ -14,20 +14,31 @@ Create a new sandbox. - CPU cores. If omitted but `memoryMB` is set, inferred automatically. + CPU cores. Must match an allowed tier: `1`, `2`, `4`, `8`, or `16`. If omitted but `memoryMB` is set, inferred automatically. 
- Memory in MB. If omitted but `cpuCount` is set, inferred automatically. + Memory in MB. Must match an allowed tier: `1024`, `4096`, `8192`, `16384`, `32768`, or `65536`. If omitted but `cpuCount` is set, inferred automatically. Create a [Burst Sandbox](/sandboxes/burst-sandboxes). Disk is preserved across infrastructure restarts; processes may restart. +The allowed CPU/memory combinations are: + +| Memory | vCPU | +| --- | --- | +| 1024 MB (1 GB) | 1 | +| 4096 MB (4 GB) | 1 | +| 8192 MB (8 GB) | 2 | +| 16384 MB (16 GB) | 4 | +| 32768 MB (32 GB) | 8 | +| 65536 MB (64 GB) | 16 | + The 1 GB tier provides 1 vCPU on a best-effort basis. For guaranteed CPU allocation, use the 4 GB tier or above. -If both `cpuCount` and `memoryMB` are provided, they must match a platform tier. +If both `cpuCount` and `memoryMB` are provided, they must match one of these tiers. Environment variables as key-value pairs diff --git a/docs/reference/cli/scaling.mdx b/docs/reference/cli/scaling.mdx index 62f42815..4a6b200d 100644 --- a/docs/reference/cli/scaling.mdx +++ b/docs/reference/cli/scaling.mdx @@ -11,7 +11,7 @@ The CLI groups three related actions for sizing a sandbox: | [`oc sandbox scale`](#oc-sandbox-scale-id-memory-mb) | Manually resize once | | [`oc sandbox lock`](#oc-sandbox-lock-id) / [`unlock`](#oc-sandbox-unlock-id) / [`lock-status`](#oc-sandbox-lock-status-id) | Freeze or unfreeze the current size | -CPU follows memory per the platform's tier table; you don't pick CPU separately. +Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CPU follows memory per the platform's tier table (e.g. 8 GB → 4 vCPU); you don't pick CPU separately. For the underlying concepts and how these three modes interact, see the [Elasticity](/sandboxes/elasticity) guide. @@ -53,8 +53,8 @@ The asymmetry is deliberate: rapid response when the user notices lag, conservat | --- | --- | | `--on` | Enable autoscale (requires `--min` and `--max`). | | `--off` | Disable autoscale. 
Mutually exclusive with `--on`. | -| `--min N` | Minimum memory in MB. | -| `--max N` | Maximum memory in MB. Must be ≥ `--min`. | +| `--min N` | Minimum memory tier in MB. Must be an allowed tier. | +| `--max N` | Maximum memory tier in MB. Must be an allowed tier and ≥ `--min`. | **Errors** diff --git a/docs/reference/python-sdk/scaling.mdx b/docs/reference/python-sdk/scaling.mdx index b8138a6f..214f16c9 100644 --- a/docs/reference/python-sdk/scaling.mdx +++ b/docs/reference/python-sdk/scaling.mdx @@ -11,7 +11,7 @@ OpenComputer sandboxes can change size at runtime. There are three knobs: | Track memory pressure automatically | [`set_autoscale()`](#sandbox-set-autoscale) | | Freeze the current size | [`set_scaling_lock()`](#sandbox-set-scaling-lock-locked) | -CPU follows memory per the platform's tier table. You don't pick CPU separately. +Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CPU follows memory per the platform's tier table (e.g. 8 GB → 4 vCPU). You don't pick CPU separately. ## How the three interact @@ -26,7 +26,7 @@ CPU follows memory per the platform's tier table. You don't pick CPU separately. Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) - Target memory in MB. + Target memory tier in MB. Must be one of the allowed tiers. **Returns:** `dict` with `sandboxID`, `memoryMB`, `cpuPercent`. @@ -61,11 +61,11 @@ Enable or disable per-sandbox autoscale. - Lower bound when `enabled=True`. + Lower bound when `enabled=True`. Must be an allowed tier. - Upper bound when `enabled=True`. Must be ≥ `min_memory_mb`. + Upper bound when `enabled=True`. Must be an allowed tier and ≥ `min_memory_mb`. **Returns:** `dict` with `sandboxID`, `enabled`, `minMemoryMB`, `maxMemoryMB`. diff --git a/docs/reference/typescript-sdk.mdx b/docs/reference/typescript-sdk.mdx index e5c2584a..754fc8df 100644 --- a/docs/reference/typescript-sdk.mdx +++ b/docs/reference/typescript-sdk.mdx @@ -33,8 +33,8 @@ Create a new sandbox. 
| `apiUrl` | string | env var | API URL | | `envs` | Record\ | — | Environment variables | | `metadata` | Record\ | — | Arbitrary metadata | -| `cpuCount` | number | — | CPU cores | -| `memoryMB` | number | — | Memory in MB | +| `cpuCount` | number | — | CPU cores (1, 2, or 4) | +| `memoryMB` | number | — | Memory in MB (1024, 4096, 8192, 16384, 32768, or 65536) | | `image` | Image | — | Declarative image definition (see [Image](#image)) | | `snapshot` | string | — | Name of a pre-built snapshot | | `onBuildLog` | `(log: string) => void` | — | Build log callback (when using `image`) | diff --git a/docs/reference/typescript-sdk/scaling.mdx b/docs/reference/typescript-sdk/scaling.mdx index 0b01de58..4ca44ae5 100644 --- a/docs/reference/typescript-sdk/scaling.mdx +++ b/docs/reference/typescript-sdk/scaling.mdx @@ -11,7 +11,7 @@ OpenComputer sandboxes can change size at runtime. There are three knobs: | Track memory pressure automatically | [`setAutoscale()`](#sandbox-setautoscale-opts) | | Freeze the current size | [`setScalingLock()`](#sandbox-setscalinglock-locked) | -CPU follows memory per the platform's tier table. You don't pick CPU separately. +Memory tiers are fixed: `1024`, `4096`, `8192`, `16384`, `32768`, `65536` MB. CPU follows memory per the platform's tier table (e.g. 8 GB → 4 vCPU). You don't pick CPU separately. ## How the three interact @@ -26,7 +26,7 @@ CPU follows memory per the platform's tier table. You don't pick CPU separately. Manually resize the sandbox. [HTTP API →](/api-reference/sandboxes/scale) - Target memory in MB. + Target memory tier in MB. Must be one of the allowed tiers. **Returns:** `Promise<{ sandboxID: string; memoryMB: number; cpuPercent: number }>` @@ -66,11 +66,11 @@ Enable or disable per-sandbox autoscale.
- Lower bound when `enabled=true`. + Lower bound when `enabled=true`. Must be an allowed tier. - Upper bound when `enabled=true`. Must be ≥ `minMemoryMB`. + Upper bound when `enabled=true`. Must be an allowed tier and ≥ `minMemoryMB`. **Returns:** `Promise<{ sandboxID: string; enabled: boolean; minMemoryMB: number; maxMemoryMB: number }>` diff --git a/docs/sandboxes/templates.mdx b/docs/sandboxes/templates.mdx index 67fea5f7..ce17a574 100644 --- a/docs/sandboxes/templates.mdx +++ b/docs/sandboxes/templates.mdx @@ -195,7 +195,7 @@ curl -X POST https://app.opencomputer.dev/api/sandboxes \ `memoryMB` is clamped to a valid range, and the response's `memoryMB` reports the effective value: - **Floor — the snapshot's own memory.** A smaller request is ignored; a fork can't start smaller than the snapshot it restores. A 4 GB snapshot forked with `memoryMB: 1024` still boots at ~4 GB. -- **Ceiling.** Larger requests are capped to the maximum platform tier. +- **Ceiling — 16 GB.** Larger requests are capped (`32768` → `16384`). The same `memoryMB` field works when [forking from a checkpoint](/api-reference/checkpoints/fork). 
From 94ffe658a66fc4b78460a7b4247ff4d404f4631d Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 16:08:54 -0700 Subject: [PATCH 24/32] Add AWS worker AMI build workflow --- .github/workflows/build-aws-worker-ami.yml | 188 +++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 .github/workflows/build-aws-worker-ami.yml diff --git a/.github/workflows/build-aws-worker-ami.yml b/.github/workflows/build-aws-worker-ami.yml new file mode 100644 index 00000000..97096fd3 --- /dev/null +++ b/.github/workflows/build-aws-worker-ami.yml @@ -0,0 +1,188 @@ +name: Build AWS Worker AMI + +on: + workflow_dispatch: + inputs: + region: + description: AWS region to build the AMI in + required: true + default: us-east-2 + builder_instance_type: + description: EC2 instance type used by Packer for the build + required: true + default: c5.4xlarge + cleanup_old_amis: + description: Deregister old OpenComputer AWS worker AMIs after a successful build + required: true + default: "false" + type: choice + options: + - "false" + - "true" + ssm_parameter_prefix: + description: SSM prefix to update after publishing the AMI + required: true + default: /opencomputer/aws-us-east-2-burst-prod + +env: + AWS_REGION: ${{ inputs.region || vars.AWS_REGION || 'us-east-2' }} + BUILDER_INSTANCE_TYPE: ${{ inputs.builder_instance_type || 'c5.4xlarge' }} + SSM_PARAMETER_PREFIX: ${{ inputs.ssm_parameter_prefix || '/opencomputer/aws-us-east-2-burst-prod' }} + +jobs: + build-ami: + name: Build AWS Worker AMI + runs-on: ubuntu-latest + environment: aws-prod + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/setup-go@v5 + with: + go-version: "1.23" + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_PROD_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + + - name: Setup Packer + uses: hashicorp/setup-packer@main + + - name: 
Build binaries (amd64) + run: | + VERSION=$(git rev-parse --short HEAD) + echo "VERSION=$VERSION" >> "$GITHUB_ENV" + + AGENT_VERSION=$(git log -1 --pretty=format:%h -- cmd/agent internal/agent proto/agent) + if [ -z "$AGENT_VERSION" ]; then + AGENT_VERSION=$VERSION + fi + echo "AGENT_VERSION=$AGENT_VERSION" >> "$GITHUB_ENV" + + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ + -ldflags "-X main.WorkerVersion=$VERSION -X main.AgentVersion=$AGENT_VERSION" \ + -o bin/opensandbox-worker ./cmd/worker/ + + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ + -ldflags "-X main.Version=$AGENT_VERSION" \ + -o bin/osb-agent ./cmd/agent/ + + - name: Package rootfs context + run: | + tar czf /tmp/packer-rootfs-ctx.tar.gz \ + deploy/firecracker/rootfs/ \ + deploy/ec2/build-rootfs-docker.sh \ + scripts/claude-agent-wrapper/ + + - name: Package Vector configs + run: tar czf /tmp/packer-vector-ctx.tar.gz -C deploy vector + + - name: Packer init + run: packer init deploy/packer/worker-ami-aws.pkr.hcl + + - name: Build and publish AMI + run: | + packer build \ + -var "worker_version=$VERSION" \ + -var "agent_version=$AGENT_VERSION" \ + -var "region=$AWS_REGION" \ + -var "instance_type=$BUILDER_INSTANCE_TYPE" \ + deploy/packer/worker-ami-aws.pkr.hcl | tee /tmp/packer-output.txt + + - name: Read AMI manifest + id: ami + run: | + AMI_ID=$(jq -r '.builds[-1].artifact_id | split(":")[-1]' packer-manifest-aws.json) + if [ -z "$AMI_ID" ] || [ "$AMI_ID" = "null" ]; then + echo "Could not read AMI ID from packer-manifest-aws.json" + cat packer-manifest-aws.json + exit 1 + fi + + GOLDEN_VERSION=$(grep -a 'Golden version:' /tmp/packer-output.txt | tail -1 | awk '{print $NF}' | tr -d '\r') + + echo "ami_id=$AMI_ID" >> "$GITHUB_OUTPUT" + echo "golden_version=$GOLDEN_VERSION" >> "$GITHUB_OUTPUT" + echo "AMI_ID=$AMI_ID" >> "$GITHUB_ENV" + echo "GOLDEN_VERSION=$GOLDEN_VERSION" >> "$GITHUB_ENV" + + - name: Verify AMI tags + run: | + aws ec2 describe-images \ + --region "$AWS_REGION" \ + --image-ids 
"$AMI_ID" \ + --query 'Images[0].{ImageId:ImageId,Name:Name,State:State,Role:Tags[?Key==`opensandbox-role`].Value|[0],Cloud:Tags[?Key==`opensandbox-cloud`].Value|[0],Version:Tags[?Key==`opensandbox-version`].Value|[0]}' \ + --output table + + - name: Update worker AMI pointer + run: | + aws ssm put-parameter \ + --region "$AWS_REGION" \ + --name "$SSM_PARAMETER_PREFIX/worker-ami-id" \ + --type String \ + --value "$AMI_ID" \ + --overwrite + + aws ssm put-parameter \ + --region "$AWS_REGION" \ + --name "$SSM_PARAMETER_PREFIX/worker-ami-version" \ + --type String \ + --value "$VERSION" \ + --overwrite + + echo "Updated SSM worker AMI pointer: $SSM_PARAMETER_PREFIX/worker-ami-id -> $AMI_ID" + + - name: Cleanup old AMIs + if: ${{ inputs.cleanup_old_amis == 'true' }} + run: | + set -euo pipefail + + mapfile -t OLD_AMIS < <( + aws ec2 describe-images \ + --region "$AWS_REGION" \ + --owners self \ + --filters \ + "Name=tag:opensandbox-role,Values=worker" \ + "Name=tag:opensandbox-cloud,Values=aws" \ + "Name=state,Values=available" \ + --query 'reverse(sort_by(Images,&CreationDate))[10:].ImageId' \ + --output text | tr '\t' '\n' | awk 'NF' + ) + + if [ "${#OLD_AMIS[@]}" -eq 0 ]; then + echo "No old AMIs to clean up" + exit 0 + fi + + for AMI in "${OLD_AMIS[@]}"; do + echo "Deregistering old AMI: $AMI" + SNAPSHOTS=$(aws ec2 describe-images \ + --region "$AWS_REGION" \ + --image-ids "$AMI" \ + --query 'Images[0].BlockDeviceMappings[].Ebs.SnapshotId' \ + --output text) + aws ec2 deregister-image --region "$AWS_REGION" --image-id "$AMI" + for SNAPSHOT in $SNAPSHOTS; do + echo "Deleting old AMI snapshot: $SNAPSHOT" + aws ec2 delete-snapshot --region "$AWS_REGION" --snapshot-id "$SNAPSHOT" || true + done + done + + - name: Summary + run: | + echo "## AWS Worker AMI Build Complete" >> "$GITHUB_STEP_SUMMARY" + echo "- **AMI ID:** \`$AMI_ID\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Worker version:** \`$VERSION\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Agent version:** 
\`$AGENT_VERSION\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Golden version:** \`${GOLDEN_VERSION:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Region:** \`$AWS_REGION\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **SSM pointer:** \`$SSM_PARAMETER_PREFIX/worker-ami-id\`" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "The CP scaler reads this AMI through \`OPENSANDBOX_EC2_SSM_AMI_PARAM\`. Terraform creates the parameter but ignores value drift so later applies do not overwrite CI-published AMIs." >> "$GITHUB_STEP_SUMMARY" From 2270e73944992cbf62c1fad37630608850db226e Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 18:49:38 -0700 Subject: [PATCH 25/32] Run AWS worker AMI build on branch push --- .github/workflows/build-aws-worker-ami.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-aws-worker-ami.yml b/.github/workflows/build-aws-worker-ami.yml index 97096fd3..b25b31ff 100644 --- a/.github/workflows/build-aws-worker-ami.yml +++ b/.github/workflows/build-aws-worker-ami.yml @@ -1,6 +1,9 @@ name: Build AWS Worker AMI on: + push: + branches: + - feat/aws-poc-worker-support-clean-rebased workflow_dispatch: inputs: region: From fcb775964bd67bbc216e91839ce8684468d43db7 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 19:19:30 -0700 Subject: [PATCH 26/32] Use generic AWS role secret for AMI workflow --- .github/workflows/build-aws-worker-ami.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-aws-worker-ami.yml b/.github/workflows/build-aws-worker-ami.yml index b25b31ff..a9bc4417 100644 --- a/.github/workflows/build-aws-worker-ami.yml +++ b/.github/workflows/build-aws-worker-ami.yml @@ -52,7 +52,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: ${{ secrets.AWS_PROD_ROLE_ARN }} + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - name: Setup Packer From 
344f7a6054cc828cd4dafadacb091e87cd1ddb74 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Fri, 5 Jun 2026 21:51:46 -0700 Subject: [PATCH 27/32] Add AWS Burst CP deploy workflow --- .github/workflows/build-aws-worker-ami.yml | 1 + .github/workflows/deploy-aws-burst-cp.yml | 283 +++++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100644 .github/workflows/deploy-aws-burst-cp.yml diff --git a/.github/workflows/build-aws-worker-ami.yml b/.github/workflows/build-aws-worker-ami.yml index a9bc4417..303a06a7 100644 --- a/.github/workflows/build-aws-worker-ami.yml +++ b/.github/workflows/build-aws-worker-ami.yml @@ -3,6 +3,7 @@ name: Build AWS Worker AMI on: push: branches: + - main - feat/aws-poc-worker-support-clean-rebased workflow_dispatch: inputs: diff --git a/.github/workflows/deploy-aws-burst-cp.yml b/.github/workflows/deploy-aws-burst-cp.yml new file mode 100644 index 00000000..9ee25958 --- /dev/null +++ b/.github/workflows/deploy-aws-burst-cp.yml @@ -0,0 +1,283 @@ +name: Deploy AWS Burst Control Plane + +on: + push: + branches: + - main + - feat/aws-poc-worker-support-clean-rebased + paths: + - "cmd/server/**" + - "internal/**" + - "web/**" + - "deploy/vector/**" + - "go.mod" + - "go.sum" + - ".github/workflows/deploy-aws-burst-cp.yml" + workflow_dispatch: + inputs: + region: + description: AWS region + required: true + default: us-east-2 + cell_id: + description: Cell ID + required: true + default: aws-us-east-2-burst-prod + cp_domain: + description: Public control-plane domain + required: true + default: cp-aws-us-east-2-burst-prod.opencomputer.dev + +env: + AWS_REGION: ${{ inputs.region || vars.AWS_REGION || 'us-east-2' }} + CELL_ID: ${{ inputs.cell_id || 'aws-us-east-2-burst-prod' }} + CP_DOMAIN: ${{ inputs.cp_domain || 'cp-aws-us-east-2-burst-prod.opencomputer.dev' }} + GOARCH: amd64 + +jobs: + deploy: + name: Deploy AWS Burst CP + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + id-token: write + contents: read + steps: + - 
uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version: "1.23" + + - uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + + - name: Discover control plane + run: | + set -euo pipefail + + CP_INSTANCE_ID=$(aws ec2 describe-instances \ + --region "$AWS_REGION" \ + --filters \ + "Name=tag:Name,Values=oc-cp-$CELL_ID" \ + "Name=instance-state-name,Values=running" \ + --query 'Reservations[0].Instances[0].InstanceId' \ + --output text) + + if [ -z "$CP_INSTANCE_ID" ] || [ "$CP_INSTANCE_ID" = "None" ]; then + echo "No running CP instance found for $CELL_ID" + exit 1 + fi + + CP_PUBLIC_IP=$(aws ec2 describe-instances \ + --region "$AWS_REGION" \ + --instance-ids "$CP_INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].PublicIpAddress' \ + --output text) + + CP_AZ=$(aws ec2 describe-instances \ + --region "$AWS_REGION" \ + --instance-ids "$CP_INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].Placement.AvailabilityZone' \ + --output text) + + CP_SG_ID=$(aws ec2 describe-security-groups \ + --region "$AWS_REGION" \ + --filters "Name=group-name,Values=oc-sg-cp-$CELL_ID" \ + --query 'SecurityGroups[0].GroupId' \ + --output text) + + if [ -z "$CP_PUBLIC_IP" ] || [ "$CP_PUBLIC_IP" = "None" ]; then + echo "CP instance $CP_INSTANCE_ID has no public IP" + exit 1 + fi + + if [ -z "$CP_SG_ID" ] || [ "$CP_SG_ID" = "None" ]; then + echo "CP security group not found for $CELL_ID" + exit 1 + fi + + echo "CP_INSTANCE_ID=$CP_INSTANCE_ID" >> "$GITHUB_ENV" + echo "CP_PUBLIC_IP=$CP_PUBLIC_IP" >> "$GITHUB_ENV" + echo "CP_AZ=$CP_AZ" >> "$GITHUB_ENV" + echo "CP_SG_ID=$CP_SG_ID" >> "$GITHUB_ENV" + + - name: Build server binary + run: | + set -euo pipefail + VERSION=$(git rev-parse --short HEAD) + echo "VERSION=$VERSION" >> "$GITHUB_ENV" + CGO_ENABLED=0 GOOS=linux GOARCH="$GOARCH" go 
build \ + -ldflags "-X main.Version=$VERSION" \ + -o bin/opensandbox-server ./cmd/server/ + + - name: Build web dashboard + run: cd web && npm ci && npm run build + + - name: Package deploy artifacts + run: | + set -euo pipefail + tar czf bin/web-dist.tar.gz -C web dist + tar czf bin/vector-deploy.tar.gz -C deploy vector + + - name: Prepare SSH key + run: | + set -euo pipefail + mkdir -p ~/.ssh + printf '%s\n' "${{ secrets.AWS_CP_SSH_PRIVATE_KEY }}" > ~/.ssh/aws-burst-cp + chmod 600 ~/.ssh/aws-burst-cp + ssh-keygen -y -f ~/.ssh/aws-burst-cp > ~/.ssh/aws-burst-cp.pub + + - name: Authorize runner SSH + run: | + set -euo pipefail + RUNNER_IP=$(curl -fsSL https://checkip.amazonaws.com | tr -d '[:space:]') + echo "RUNNER_CIDR=${RUNNER_IP}/32" >> "$GITHUB_ENV" + + set +e + aws ec2 authorize-security-group-ingress \ + --region "$AWS_REGION" \ + --group-id "$CP_SG_ID" \ + --ip-permissions "IpProtocol=tcp,FromPort=22,ToPort=22,IpRanges=[{CidrIp=${RUNNER_IP}/32,Description=github-actions-cp-deploy-${GITHUB_RUN_ID}}]" + STATUS=$? + set -e + + if [ "$STATUS" -ne 0 ]; then + echo "SSH ingress may already exist; continuing so deploy can attempt SSH." 
+ fi + + - name: Push SSH key with EC2 Instance Connect + run: | + set -euo pipefail + aws ec2-instance-connect send-ssh-public-key \ + --region "$AWS_REGION" \ + --instance-id "$CP_INSTANCE_ID" \ + --availability-zone "$CP_AZ" \ + --instance-os-user ubuntu \ + --ssh-public-key "file://$HOME/.ssh/aws-burst-cp.pub" + + - name: Wait for SSH + run: | + set -euo pipefail + for i in $(seq 1 30); do + if ssh -i ~/.ssh/aws-burst-cp \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o ConnectTimeout=5 \ + ubuntu@"$CP_PUBLIC_IP" 'echo ssh-ready'; then + exit 0 + fi + sleep 5 + done + echo "SSH did not become ready" + exit 1 + + - name: Upload artifacts + run: | + set -euo pipefail + scp -i ~/.ssh/aws-burst-cp \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + bin/opensandbox-server bin/web-dist.tar.gz bin/vector-deploy.tar.gz \ + ubuntu@"$CP_PUBLIC_IP":/tmp/ + + - name: Install and restart control plane + run: | + set -euo pipefail + ssh -i ~/.ssh/aws-burst-cp \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + ubuntu@"$CP_PUBLIC_IP" 'bash -s' <<'REMOTE' + set -euo pipefail + + sudo mkdir -p /opt/opensandbox/web + sudo install -m 0755 /tmp/opensandbox-server /usr/local/bin/opensandbox-server + sudo tar xzf /tmp/web-dist.tar.gz -C /opt/opensandbox/web + + sudo tee /etc/systemd/system/opensandbox-server.service >/dev/null <<'UNIT' + [Unit] + Description=OpenComputer Control Plane + After=network-online.target cloud-init.target + Wants=network-online.target + + [Service] + Type=simple + WorkingDirectory=/opt/opensandbox + EnvironmentFile=/etc/opensandbox/server.env + ExecStart=/usr/local/bin/opensandbox-server + Restart=always + RestartSec=5 + + [Install] + WantedBy=multi-user.target + UNIT + + sudo systemctl daemon-reload + sudo systemctl enable opensandbox-server + sudo systemctl restart opensandbox-server + + mkdir -p /tmp/vector-deploy + tar xzf /tmp/vector-deploy.tar.gz -C /tmp/vector-deploy + sudo bash 
/tmp/vector-deploy/vector/install.sh control-plane || true + rm -rf /tmp/vector-deploy /tmp/opensandbox-server /tmp/web-dist.tar.gz /tmp/vector-deploy.tar.gz + + sudo systemctl is-active opensandbox-server + REMOTE + + - name: Health check + run: | + set -euo pipefail + + echo "Checking direct CP health..." + for i in $(seq 1 30); do + if curl -fsS --max-time 5 "http://${CP_PUBLIC_IP}:8080/health"; then + echo + break + fi + sleep 5 + done + + echo "Checking public domain health..." + for i in $(seq 1 30); do + if curl -fsS --max-time 10 "https://${CP_DOMAIN}/health"; then + echo + exit 0 + fi + sleep 5 + done + + echo "Public health check failed" + exit 1 + + - name: Revoke runner SSH + if: always() + run: | + set -euo pipefail + if [ -z "${RUNNER_CIDR:-}" ] || [ -z "${CP_SG_ID:-}" ]; then + echo "No runner SSH rule recorded; skipping revoke" + exit 0 + fi + + aws ec2 revoke-security-group-ingress \ + --region "$AWS_REGION" \ + --group-id "$CP_SG_ID" \ + --protocol tcp \ + --port 22 \ + --cidr "$RUNNER_CIDR" || true + + - name: Summary + if: always() + run: | + echo "## AWS Burst CP Deploy" >> "$GITHUB_STEP_SUMMARY" + echo "- **Cell:** \`$CELL_ID\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **CP instance:** \`${CP_INSTANCE_ID:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **CP public IP:** \`${CP_PUBLIC_IP:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Version:** \`${VERSION:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Domain:** \`https://$CP_DOMAIN\`" >> "$GITHUB_STEP_SUMMARY" From cb10a6a99188cf35a31b170bfe93326065584f91 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Mon, 8 Jun 2026 14:20:22 -0700 Subject: [PATCH 28/32] Fix AWS burst worker bootstrap storage --- internal/compute/ec2.go | 47 +++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/internal/compute/ec2.go b/internal/compute/ec2.go index cf17c670..9a6e80ee 100644 --- a/internal/compute/ec2.go +++ b/internal/compute/ec2.go @@ -470,6 +470,9 @@ 
func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString("systemctl stop opensandbox-worker.service 2>/dev/null || true\n") sb.WriteString("systemctl disable opensandbox-worker.service 2>/dev/null || true\n") sb.WriteString("systemctl reset-failed opensandbox-worker.service 2>/dev/null || true\n\n") + sb.WriteString("systemctl stop opensandbox-server.service 2>/dev/null || true\n") + sb.WriteString("systemctl disable opensandbox-server.service 2>/dev/null || true\n") + sb.WriteString("systemctl reset-failed opensandbox-server.service 2>/dev/null || true\n\n") sb.WriteString("# Instance identity from EC2 metadata (IMDSv2)\n") sb.WriteString("TOKEN=$(curl -fsS -X PUT 'http://169.254.169.254/latest/api/token' -H 'X-aws-ec2-metadata-token-ttl-seconds: 300')\n") @@ -480,26 +483,25 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { // NVMe instance store handling. Larger metal/x.gd instance families expose // multiple NVMe drives at /dev/nvme[1-N]n1; smaller instances rely on EBS // (the attached data volume). RAID 0 across instance store NVMe when present. - sb.WriteString("# Mount data: prefer NVMe instance store (RAID 0), else first EBS data volume\n") + sb.WriteString("# Mount data: prefer EC2 instance-store NVMe (RAID 0). Otherwise use root fs for /data.\n") sb.WriteString("if ! 
mountpoint -q /data 2>/dev/null; then\n") sb.WriteString(" mkdir -p /data\n") sb.WriteString(" ROOT_DEV=$(lsblk -no PKNAME $(findmnt -n -o SOURCE /) 2>/dev/null | head -1)\n") sb.WriteString(" NVME_DISKS=()\n") - sb.WriteString(" for d in /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1; do\n") - sb.WriteString(" [ -b \"$d\" ] || continue\n") - sb.WriteString(" [ \"$(basename $d)\" = \"$ROOT_DEV\" ] && continue\n") - sb.WriteString(" NVME_DISKS+=(\"$d\")\n") - sb.WriteString(" done\n") + sb.WriteString(" while read -r name model; do\n") + sb.WriteString(" [ -n \"${name:-}\" ] || continue\n") + sb.WriteString(" [ \"$name\" = \"$ROOT_DEV\" ] && continue\n") + sb.WriteString(" [ \"$model\" = \"Amazon EC2 NVMe Instance Storage\" ] || continue\n") + sb.WriteString(" NVME_DISKS+=(\"/dev/$name\")\n") + sb.WriteString(" done < <(lsblk -dn -o NAME,MODEL)\n") + sb.WriteString(" if [ ${#NVME_DISKS[@]} -eq 0 ]; then\n") + sb.WriteString(" echo 'No EC2 instance-store NVMe found; using root filesystem for /data'\n") + sb.WriteString(" fi\n") sb.WriteString(" if [ ${#NVME_DISKS[@]} -gt 1 ]; then\n") sb.WriteString(" mdadm --create /dev/md0 --level=0 --raid-devices=${#NVME_DISKS[@]} \"${NVME_DISKS[@]}\" --run --force\n") sb.WriteString(" mkfs.xfs -f -m reflink=1 /dev/md0 && mount /dev/md0 /data\n") sb.WriteString(" elif [ ${#NVME_DISKS[@]} -eq 1 ]; then\n") sb.WriteString(" mkfs.xfs -f -m reflink=1 \"${NVME_DISKS[0]}\" && mount \"${NVME_DISKS[0]}\" /data\n") - sb.WriteString(" else\n") - sb.WriteString(" for d in /dev/nvme1n1 /dev/sdb /dev/xvdb; do\n") - sb.WriteString(" [ -b \"$d\" ] || continue\n") - sb.WriteString(" mkfs.xfs -f -m reflink=1 \"$d\" && mount \"$d\" /data && break\n") - sb.WriteString(" done\n") sb.WriteString(" fi\n") sb.WriteString("fi\n") sb.WriteString("mkdir -p /data/sandboxes /data/firecracker/images\n") @@ -591,6 +593,9 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString(" sleep 1\n") sb.WriteString("done\n") 
sb.WriteString("if [ -z \"${SANDBOX_DEV:-}\" ]; then echo \"ERROR: shared sandbox data volume not attached\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") + sb.WriteString("SANDBOX_SERIAL=$(lsblk -dn -o SERIAL \"$SANDBOX_DEV\" 2>/dev/null | head -1 || true)\n") + sb.WriteString("if [ \"$SANDBOX_SERIAL\" != \"$SANDBOX_VOL_NO_DASH\" ]; then echo \"ERROR: $SANDBOX_DEV serial $SANDBOX_SERIAL does not match sandbox volume $SANDBOX_VOLUME_ID\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") + sb.WriteString("echo \"Using shared sandbox data volume $SANDBOX_VOLUME_ID at $SANDBOX_DEV\"\n") sb.WriteString("mapfile -t OCFS2_NODES < <(for i in $(seq 1 60); do aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u; break; done)\n") sb.WriteString("for i in $(seq 1 60); do\n") sb.WriteString(" [ \"${#OCFS2_NODES[@]}\" -ge \"$OCFS2_EXPECTED_NODES\" ] && break\n") @@ -608,8 +613,10 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString("mkdir -p /data/sandboxes\n") sb.WriteString("FSTYPE=$(blkid -s TYPE -o value \"$SANDBOX_DEV\" 2>/dev/null || true)\n") sb.WriteString("if [ -z \"$FSTYPE\" ]; then mkfs.ocfs2 -F -N \"$OCFS2_MAX_NODES\" -L opensandbox-sandboxes -T vmstore \"$SANDBOX_DEV\"; fi\n") + sb.WriteString("FSTYPE=$(blkid -s TYPE -o value \"$SANDBOX_DEV\" 2>/dev/null || true)\n") + sb.WriteString("if [ \"$FSTYPE\" != \"ocfs2\" ]; then echo \"ERROR: shared sandbox data volume $SANDBOX_DEV has filesystem '$FSTYPE', expected ocfs2\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") sb.WriteString("if ! 
grep -q 'LABEL=opensandbox-sandboxes' /etc/fstab; then echo 'LABEL=opensandbox-sandboxes /data/sandboxes ocfs2 noauto,_netdev,noatime 0 0' >> /etc/fstab; fi\n") - sb.WriteString("timeout 90 mount -t ocfs2 -o noatime LABEL=opensandbox-sandboxes /data/sandboxes\n") + sb.WriteString("timeout 90 mount -t ocfs2 -o noatime \"$SANDBOX_DEV\" /data/sandboxes\n") sb.WriteString("chown root:root /data/sandboxes\n\n") return sb.String() } @@ -624,10 +631,24 @@ func (p *EC2Pool) sharedGoldensUserData() string { sb.WriteString("GOLDENS_VOL_NO_DASH=\"${GOLDENS_VOLUME_ID//-/}\"\n") sb.WriteString("for i in $(seq 1 120); do\n") sb.WriteString(" if [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}\" ]; then GOLDENS_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}\"); fi\n") + sb.WriteString(" if [ -z \"${GOLDENS_DEV:-}\" ] && [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}_1\" ]; then GOLDENS_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}_1\"); fi\n") + sb.WriteString(" if [ -z \"${GOLDENS_DEV:-}\" ]; then GOLDENS_DEV=$(lsblk -dn -o NAME,SERIAL | awk -v v=\"$GOLDENS_VOL_NO_DASH\" '$2 == v {print \"/dev/\"$1; exit}'); fi\n") sb.WriteString(" [ -n \"${GOLDENS_DEV:-}\" ] && break\n") sb.WriteString(" sleep 1\n") sb.WriteString("done\n") - sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then mount -o ro,noload \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true; fi\n") + sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then\n") + sb.WriteString(" GOLDENS_SERIAL=$(lsblk -dn -o SERIAL \"$GOLDENS_DEV\" 2>/dev/null | head -1 || true)\n") + sb.WriteString(" if [ \"$GOLDENS_SERIAL\" != \"$GOLDENS_VOL_NO_DASH\" ]; then echo \"WARN: $GOLDENS_DEV serial $GOLDENS_SERIAL does not match golden volume $GOLDENS_VOLUME_ID\"; GOLDENS_DEV=\"\"; fi\n") + sb.WriteString("fi\n") + sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then\n") + sb.WriteString(" 
GOLDENS_FSTYPE=$(blkid -s TYPE -o value \"$GOLDENS_DEV\" 2>/dev/null || true)\n") + sb.WriteString(" case \"$GOLDENS_FSTYPE\" in\n") + sb.WriteString(" ext2|ext3|ext4) mount -t \"$GOLDENS_FSTYPE\" -o ro,noload,noatime \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true ;;\n") + sb.WriteString(" xfs) mount -t xfs -o ro,noatime \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true ;;\n") + sb.WriteString(" '') echo \"WARN: shared golden volume $GOLDENS_VOLUME_ID has no filesystem; continuing without it\" ;;\n") + sb.WriteString(" *) echo \"WARN: shared golden volume $GOLDENS_VOLUME_ID has unsupported filesystem '$GOLDENS_FSTYPE'; continuing without it\" ;;\n") + sb.WriteString(" esac\n") + sb.WriteString("fi\n") sb.WriteString("if [ -d /opt/opensandbox/goldens-shared/golden ]; then ln -sfn /opt/opensandbox/goldens-shared/golden /var/lib/opensandbox/golden; fi\n\n") return sb.String() } From f69aa399c28e6918895af5c566fc3fa6e476873d Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Mon, 8 Jun 2026 17:58:37 -0700 Subject: [PATCH 29/32] Add burst compute billing meter --- internal/api/capacity.go | 4 +- internal/billing/billable_events_sender.go | 7 ++- .../billing/billable_events_sender_test.go | 15 +++++- internal/billing/capacity_reconciler.go | 45 ++++++++++++++--- internal/billing/capacity_reconciler_test.go | 13 +++++ internal/billing/pricing.go | 10 +++- internal/billing/stripe.go | 19 +++++-- internal/db/billable_events.go | 49 ++++++++++++------- .../db/migrations/030_billable_events.up.sql | 4 +- .../045_burst_billable_events.down.sql | 6 +++ .../045_burst_billable_events.up.sql | 6 +++ internal/db/store.go | 1 + internal/db/usage.go | 1 + 13 files changed, 142 insertions(+), 38 deletions(-) create mode 100644 internal/db/migrations/045_burst_billable_events.down.sql create mode 100644 internal/db/migrations/045_burst_billable_events.up.sql diff --git a/internal/api/capacity.go b/internal/api/capacity.go index d38bb2d8..6233c5e1 100644 --- 
a/internal/api/capacity.go +++ b/internal/api/capacity.go @@ -249,11 +249,11 @@ func (s *Server) getCapacityBillableEvents(c echo.Context) error { eventType := c.QueryParam("eventType") switch eventType { - case "", db.BillableEventReservedUsage, db.BillableEventOverageUsage, db.BillableEventDiskOverageUsage: + case "", db.BillableEventReservedUsage, db.BillableEventOverageUsage, db.BillableEventBurstUsage, db.BillableEventDiskOverageUsage: // ok default: return c.JSON(http.StatusBadRequest, map[string]string{ - "error": "eventType must be one of reserved_usage, overage_usage, disk_overage_usage (or omitted for all)", + "error": "eventType must be one of reserved_usage, overage_usage, burst_usage, disk_overage_usage (or omitted for all)", }) } diff --git a/internal/billing/billable_events_sender.go b/internal/billing/billable_events_sender.go index 0dd17c5e..9c2ea3af 100644 --- a/internal/billing/billable_events_sender.go +++ b/internal/billing/billable_events_sender.go @@ -138,7 +138,7 @@ func (s *BillableEventsSender) shipOne(ctx context.Context, p db.PendingBillable } // meterEventNameFor maps an outbox event_type → Stripe meter event_name. -// All three event types route to a single meter each — overage is flat +// All event types route to a single meter each — overage and burst are flat // (the per-tier memory_mb on the outbox row is preserved for analytics // but ignored when shipping; Stripe sums GB-seconds across rows). 
func (s *BillableEventsSender) meterEventNameFor(eventType string, memoryMB int) (string, error) { @@ -153,6 +153,11 @@ func (s *BillableEventsSender) meterEventNameFor(eventType string, memoryMB int) return "", fmt.Errorf("overage meter not provisioned (run EnsureProducts)") } return s.stripe.OverageMeterEventName, nil + case db.BillableEventBurstUsage: + if s.stripe.BurstMeterEventName == "" { + return "", fmt.Errorf("burst meter not provisioned (run EnsureProducts)") + } + return s.stripe.BurstMeterEventName, nil case db.BillableEventDiskOverageUsage: if s.stripe.DiskOverageMeterEventName == "" { return "", fmt.Errorf("disk overage meter not provisioned") diff --git a/internal/billing/billable_events_sender_test.go b/internal/billing/billable_events_sender_test.go index 5d7d9d96..9991f3df 100644 --- a/internal/billing/billable_events_sender_test.go +++ b/internal/billing/billable_events_sender_test.go @@ -10,12 +10,13 @@ import ( // meterEventNameFor pins the (event_type, memory_mb) → Stripe meter // routing. Overage is flat — memory_mb is preserved on the outbox row // for analytics but ignored at ship time so a 1 GB and 64 GB sandbox -// hit the same meter. +// hit the same meter. Burst is also flat, but routed to its own meter. 
func newSenderForTest() *BillableEventsSender { stripe := &StripeClient{ ReservedMeterEventName: "sandbox_compute_sandbox_reserved", OverageMeterEventName: "sandbox_compute_sandbox_overage", + BurstMeterEventName: "sandbox_compute_sandbox_burst", DiskOverageMeterEventName: "sandbox_compute_sandbox_disk_overage", } return &BillableEventsSender{stripe: stripe} @@ -56,6 +57,17 @@ func TestSender_meterEventName_diskOverage(t *testing.T) { } } +func TestSender_meterEventName_burstFlat(t *testing.T) { + s := newSenderForTest() + got, err := s.meterEventNameFor(db.BillableEventBurstUsage, 0) + if err != nil { + t.Fatalf("err: %v", err) + } + if got != "sandbox_compute_sandbox_burst" { + t.Errorf("got %q", got) + } +} + func TestSender_meterEventName_unknownTypeRejected(t *testing.T) { s := newSenderForTest() _, err := s.meterEventNameFor("totally_made_up", 0) @@ -74,6 +86,7 @@ func TestSender_meterEventName_missingProvisionRejected(t *testing.T) { }{ {db.BillableEventReservedUsage, "reserved meter not provisioned"}, {db.BillableEventOverageUsage, "overage meter not provisioned"}, + {db.BillableEventBurstUsage, "burst meter not provisioned"}, {db.BillableEventDiskOverageUsage, "disk overage meter not provisioned"}, } for _, tc := range cases { diff --git a/internal/billing/capacity_reconciler.go b/internal/billing/capacity_reconciler.go index 5b99dd5e..32b9f964 100644 --- a/internal/billing/capacity_reconciler.go +++ b/internal/billing/capacity_reconciler.go @@ -14,7 +14,7 @@ import ( // it scans for closed 15-min buckets that are at least `settle` past // their end and haven't been processed yet, runs the per-second // integration walk for each (org, bucket), and emits the resulting -// reserved_usage / overage_usage / disk_overage_usage rows to the +// reserved_usage / overage_usage / burst_usage / disk_overage_usage rows to the // `billable_events` outbox. 
// // Phase 2 runs in shadow: the rows are written but not delivered to @@ -152,6 +152,7 @@ func (r *CapacityReconciler) processBucket(ctx context.Context, orgID uuid.UUID, // is org-level, summed across all running events at each segment. type BucketTotals struct { OverageGBSecondsByTier map[int]float64 + BurstGBSeconds float64 DiskOverageGBSeconds float64 ReservedFloorGBSeconds float64 // reservedGb × secs accumulated across segments — for shadow validation only } @@ -176,6 +177,7 @@ type clippedEvent struct { From, To time.Time TierMB int DiskMB int + Burst bool } // clipEvent restricts a ScaleEvent's lifetime to the bucket window. @@ -192,7 +194,7 @@ func clipEvent(e db.ScaleEvent, bucketStart, bucketEnd time.Time) (clippedEvent, if !to.After(from) { return clippedEvent{}, false } - return clippedEvent{From: from, To: to, TierMB: e.MemoryMB, DiskMB: e.DiskMB}, true + return clippedEvent{From: from, To: to, TierMB: e.MemoryMB, DiskMB: e.DiskMB, Burst: e.Burst}, true } func collectBoundaries(events []clippedEvent, bucketStart, bucketEnd time.Time) []time.Time { @@ -216,9 +218,10 @@ func collectBoundaries(events []clippedEvent, bucketStart, bucketEnd time.Time) } type segment struct { - From, To time.Time - RunningByTier map[int]int // tier_mb → running GB at this tier - DiskOverageMB int // sum of (disk_mb − 20480) over running events + From, To time.Time + RunningByTier map[int]int // non-burst tier_mb → running GB at this tier + BurstGB int + DiskOverageMB int // sum of (disk_mb − 20480) over running events } func walkSegments(boundaries []time.Time, events []clippedEvent) []segment { @@ -229,6 +232,7 @@ func walkSegments(boundaries []time.Time, events []clippedEvent) []segment { continue } tiers := map[int]int{} + burstGB := 0 diskOver := 0 for _, e := range events { // Event is "running" in [from, to) if it covers the segment @@ -236,13 +240,18 @@ func walkSegments(boundaries []time.Time, events []clippedEvent) []segment { // boundary set includes every event 
endpoint, every segment // is fully contained in any event whose span covers `from`. if !e.From.After(from) && !e.To.Before(to) { - tiers[e.TierMB] += e.TierMB / 1024 + gb := e.TierMB / 1024 + if e.Burst { + burstGB += gb + } else { + tiers[e.TierMB] += gb + } if e.DiskMB > 20480 { diskOver += e.DiskMB - 20480 } } } - segs = append(segs, segment{From: from, To: to, RunningByTier: tiers, DiskOverageMB: diskOver}) + segs = append(segs, segment{From: from, To: to, RunningByTier: tiers, BurstGB: burstGB, DiskOverageMB: diskOver}) } return segs } @@ -251,6 +260,10 @@ func integrateSegments(segs []segment, reservedGB int) BucketTotals { out := BucketTotals{OverageGBSecondsByTier: map[int]float64{}} for _, s := range segs { secs := s.To.Sub(s.From).Seconds() + if s.BurstGB > 0 { + out.BurstGBSeconds += float64(s.BurstGB) * secs + } + usage := 0 for _, gb := range s.RunningByTier { usage += gb @@ -281,7 +294,9 @@ func integrateSegments(segs []segment, reservedGB int) BucketTotals { // actual usage — the customer paid for the floor whether or not // they used it). // - overage_usage — one row per (org, sandbox_tier, bucket) where -// the tier's overage contribution is non-zero. +// the non-burst tier's overage contribution is non-zero. +// - burst_usage — one row per (org, bucket) when burst sandboxes ran. +// Burst is billed independently and does not consume reserved floors. // - disk_overage_usage — one row per (org, bucket) when any sandbox // in the bucket exceeded the 20 GB allowance. 
// @@ -319,6 +334,20 @@ func emitBucket(ctx context.Context, store *db.Store, orgID uuid.UUID, bucketSta } } + if totals.BurstGBSeconds > 0 { + ev := db.BillableEvent{ + OrgID: orgID, + EventType: db.BillableEventBurstUsage, + MemoryMB: 0, + GBSeconds: totals.BurstGBSeconds, + BucketStart: bucketStart, + BucketEnd: bucketEnd, + } + if _, err := store.UpsertBillableEvent(ctx, ev); err != nil { + return err + } + } + if totals.DiskOverageGBSeconds > 0 { ev := db.BillableEvent{ OrgID: orgID, diff --git a/internal/billing/capacity_reconciler_test.go b/internal/billing/capacity_reconciler_test.go index 75671d11..67383a45 100644 --- a/internal/billing/capacity_reconciler_test.go +++ b/internal/billing/capacity_reconciler_test.go @@ -66,6 +66,19 @@ func TestIntegrateBucket_zeroReservation_replaysLegacy(t *testing.T) { } } +func TestIntegrateBucket_burstUsageSeparateFromOverage(t *testing.T) { + bs, be := canonicalBucket() + totals := IntegrateBucket(bs, be, 8, []db.ScaleEvent{ + {MemoryMB: 8192, DiskMB: 20480, Burst: true, StartedAt: bs, EndedAt: &be}, + {MemoryMB: 8192, DiskMB: 20480, StartedAt: bs, EndedAt: &be}, + }) + eq(t, "burst", totals.BurstGBSeconds, 8*900) + eq(t, "reserved floor", totals.ReservedFloorGBSeconds, 8*900) + if len(totals.OverageGBSecondsByTier) != 0 { + t.Errorf("expected burst to stay out of overage, got %v", totals.OverageGBSecondsByTier) + } +} + func TestIntegrateBucket_fullCoverage_noOverage(t *testing.T) { bs, be := canonicalBucket() // One 8 GB sandbox, reservation covers it. diff --git a/internal/billing/pricing.go b/internal/billing/pricing.go index 62db397b..8fed72a1 100644 --- a/internal/billing/pricing.go +++ b/internal/billing/pricing.go @@ -54,8 +54,8 @@ var TierPriceKey = map[int]string{ // full lifetime of the sandbox (running OR hibernated, since the workspace // qcow2 still occupies host disk). 
const ( - DiskFreeAllowanceMB = 20480 // 20GB included with every sandbox - DiskOveragePricePerGBPerSecond = 0.0000001 // ~$0.26 per GB-month + DiskFreeAllowanceMB = 20480 // 20GB included with every sandbox + DiskOveragePricePerGBPerSecond = 0.0000001 // ~$0.26 per GB-month DiskOverageMetadataKey = "sandbox_disk_overage" ) @@ -68,6 +68,8 @@ const ( // reserve capacity pay this. // - Overage: customer-facing name "instant." Flat per-GB rate // regardless of sandbox tier. +// - Burst: customer-facing name "burst." Flat per-GB rate for +// restartable burst sandboxes, independent of reserved capacity. // // **Dollar rates are deliberately not in code.** Stripe Prices are // configured in the Stripe Dashboard (or via Stripe API by the @@ -92,6 +94,10 @@ const ( // Instant (overage): single meter, flat across all sandbox sizes. OverageMeterKey = "sandbox_overage" OveragePriceKey = "sandbox_overage_v1" + + // Burst: single meter, flat across all sandbox sizes. + BurstMeterKey = "sandbox_burst" + BurstPriceKey = "sandbox_burst_v1" ) // DiskOverageGBSeconds returns the chargeable GB-seconds for one usage summary diff --git a/internal/billing/stripe.go b/internal/billing/stripe.go index a66b391d..0bb2749a 100644 --- a/internal/billing/stripe.go +++ b/internal/billing/stripe.go @@ -32,17 +32,19 @@ type StripeClient struct { MeterEventNames map[int]string // Disk overage meter / price (single dimension: GB-seconds above 20GB). - DiskOveragePriceID string + DiskOveragePriceID string DiskOverageMeterEventName string - // Phase-3 unified-pipeline meters and prices. Two flat meters - // (overage + reserved) used by new orgs (`billing_mode='unified'`). + // Phase-3 unified-pipeline meters and prices. Flat meters + // (overage + reserved + burst) used by new orgs (`billing_mode='unified'`). // Legacy per-tier meters above are untouched and continue to serve // existing orgs via UsageReporter. 
OveragePriceID string // flat overage Price for `overage_usage` events OverageMeterEventName string // "sandbox_compute_sandbox_overage" ReservedPriceID string // flat reserved Price for `reserved_usage` events ReservedMeterEventName string // "sandbox_compute_sandbox_reserved" + BurstPriceID string // flat burst Price for `burst_usage` events + BurstMeterEventName string // "sandbox_compute_sandbox_burst" // Per-agent paywalled-feature prices. Configured from env and // referenced by name from the dashboard subscribe handlers. Empty @@ -236,8 +238,8 @@ func (s *StripeClient) EnsureProducts() error { log.Printf("billing: created disk overage price (id=%s)", p.ID) } - // 5. Phase-3 unified-pipeline meters. Two flat meters (overage + - // reserved) at unit GB-seconds. Code creates the *meters* (their + // 5. Phase-3 unified-pipeline meters. Flat meters (overage, reserved, + // and burst) at unit GB-seconds. Code creates the *meters* (their // event names are stable wire-protocol coupling) but **does not // create Prices** — those are configured in the Stripe Dashboard // so pricing changes don't need a code deploy. The Price IDs @@ -246,6 +248,7 @@ func (s *StripeClient) EnsureProducts() error { // linked to the meter in Stripe. 
s.OverageMeterEventName = ensureMeter(existingMeters, "sandbox_compute_"+OverageMeterKey, "Sandbox Instant Compute (GB-seconds)") s.ReservedMeterEventName = ensureMeter(existingMeters, "sandbox_compute_"+ReservedMeterKey, "Sandbox Reserved Capacity (GB-seconds)") + s.BurstMeterEventName = ensureMeter(existingMeters, "sandbox_compute_"+BurstMeterKey, "Sandbox Burst Compute (GB-seconds)") if id, ok := existingPrices[OveragePriceKey]; ok { s.OveragePriceID = id @@ -259,6 +262,12 @@ func (s *StripeClient) EnsureProducts() error { } else { log.Printf("billing: no reserved price configured for meter %s — meter events will flow but won't appear on invoices until a Price is created in Stripe", s.ReservedMeterEventName) } + if id, ok := existingPrices[BurstPriceKey]; ok { + s.BurstPriceID = id + log.Printf("billing: found existing burst price (id=%s)", id) + } else { + log.Printf("billing: no burst price configured for meter %s — meter events will flow but won't appear on invoices until a Price is created in Stripe", s.BurstMeterEventName) + } return nil } diff --git a/internal/db/billable_events.go b/internal/db/billable_events.go index a0bc014a..3d42672e 100644 --- a/internal/db/billable_events.go +++ b/internal/db/billable_events.go @@ -19,9 +19,10 @@ import ( // Event types written to billable_events.event_type. Mirror the schema // CHECK constraint in migration 030. const ( - BillableEventReservedUsage = "reserved_usage" - BillableEventOverageUsage = "overage_usage" - BillableEventDiskOverageUsage = "disk_overage_usage" + BillableEventReservedUsage = "reserved_usage" + BillableEventOverageUsage = "overage_usage" + BillableEventBurstUsage = "burst_usage" + BillableEventDiskOverageUsage = "disk_overage_usage" ) // Delivery states. Mirror the schema CHECK constraint in migration 030. @@ -33,22 +34,23 @@ const ( // BillableEvent is one outbox row. 
// -// `MemoryMB` is 0 for `reserved_usage` and `disk_overage_usage` (sentinel -// for "not a sandbox tier"), and the running sandbox tier for +// `MemoryMB` is 0 for `reserved_usage`, `burst_usage`, and +// `disk_overage_usage` (sentinel for "not a sandbox tier"), and the +// running sandbox tier for // `overage_usage` (one row per tier per bucket via the proportional split // rule — see ws-pricing/work/001 "Per-second integration walk"). type BillableEvent struct { - ID uuid.UUID `json:"id"` - OrgID uuid.UUID `json:"orgId"` - EventType string `json:"eventType"` - MemoryMB int `json:"memoryMB"` - GBSeconds float64 `json:"gbSeconds"` - BucketStart time.Time `json:"bucketStart"` - BucketEnd time.Time `json:"bucketEnd"` - DeliveryState string `json:"deliveryState"` - StripeEventID *string `json:"stripeEventId,omitempty"` - CreatedAt time.Time `json:"createdAt"` - DeliveredAt *time.Time `json:"deliveredAt,omitempty"` + ID uuid.UUID `json:"id"` + OrgID uuid.UUID `json:"orgId"` + EventType string `json:"eventType"` + MemoryMB int `json:"memoryMB"` + GBSeconds float64 `json:"gbSeconds"` + BucketStart time.Time `json:"bucketStart"` + BucketEnd time.Time `json:"bucketEnd"` + DeliveryState string `json:"deliveryState"` + StripeEventID *string `json:"stripeEventId,omitempty"` + CreatedAt time.Time `json:"createdAt"` + DeliveredAt *time.Time `json:"deliveredAt,omitempty"` } // UpsertBillableEvent inserts a new outbox row, or no-ops if a row with @@ -304,7 +306,18 @@ func (s *Store) GetReservedGBForBucket(ctx context.Context, orgID uuid.UUID, buc // metering. See the same NOT EXISTS in GetOrgUsage for rationale. 
func (s *Store) GetScaleEventsForBucket(ctx context.Context, orgID uuid.UUID, bucketStart, bucketEnd time.Time) ([]ScaleEvent, error) { rows, err := s.pool.Query(ctx, ` - SELECT id, sandbox_id, org_id, memory_mb, cpu_percent, disk_mb, started_at, ended_at + SELECT id, sandbox_id, org_id, memory_mb, cpu_percent, disk_mb, + EXISTS ( + SELECT 1 + FROM sandbox_sessions ss + WHERE ss.sandbox_id = se.sandbox_id + AND ( + COALESCE((ss.config->>'burst')::boolean, false) + OR COALESCE((ss.config->>'resumable')::boolean, false) + OR ss.config->>'sandboxFamily' IN ('spot', 'resumable', 'burst') + ) + ) AS burst, + started_at, ended_at FROM sandbox_scale_events se WHERE org_id = $1 AND started_at < $3 @@ -330,7 +343,7 @@ func (s *Store) GetScaleEventsForBucket(ctx context.Context, orgID uuid.UUID, bu for rows.Next() { var e ScaleEvent var orgUUID uuid.UUID - if err := rows.Scan(&e.ID, &e.SandboxID, &orgUUID, &e.MemoryMB, &e.CPUPct, &e.DiskMB, &e.StartedAt, &e.EndedAt); err != nil { + if err := rows.Scan(&e.ID, &e.SandboxID, &orgUUID, &e.MemoryMB, &e.CPUPct, &e.DiskMB, &e.Burst, &e.StartedAt, &e.EndedAt); err != nil { return nil, fmt.Errorf("scan scale event: %w", err) } e.OrgID = orgUUID.String() diff --git a/internal/db/migrations/030_billable_events.up.sql b/internal/db/migrations/030_billable_events.up.sql index 443da3eb..d74f27a9 100644 --- a/internal/db/migrations/030_billable_events.up.sql +++ b/internal/db/migrations/030_billable_events.up.sql @@ -13,6 +13,8 @@ -- 'overage_usage' — instant usage above the reserved floor, -- emitted per running sandbox tier with the -- tier as memory_mb +-- 'burst_usage' — burst sandbox compute, billed as GB-seconds +-- independent of reserved capacity; memory_mb = 0 -- 'disk_overage_usage' — disk above the 20 GB allowance, org-level -- per bucket; memory_mb = 0 CREATE TABLE IF NOT EXISTS billable_events ( @@ -28,7 +30,7 @@ CREATE TABLE IF NOT EXISTS billable_events ( created_at TIMESTAMPTZ NOT NULL DEFAULT now(), delivered_at 
TIMESTAMPTZ, UNIQUE (org_id, event_type, memory_mb, bucket_start), - CHECK (event_type IN ('reserved_usage', 'overage_usage', 'disk_overage_usage')), + CHECK (event_type IN ('reserved_usage', 'overage_usage', 'burst_usage', 'disk_overage_usage')), CHECK (delivery_state IN ('pending', 'sent', 'failed')), CHECK (gb_seconds > 0), CHECK (memory_mb >= 0), diff --git a/internal/db/migrations/045_burst_billable_events.down.sql b/internal/db/migrations/045_burst_billable_events.down.sql new file mode 100644 index 00000000..73c29f22 --- /dev/null +++ b/internal/db/migrations/045_burst_billable_events.down.sql @@ -0,0 +1,6 @@ +ALTER TABLE billable_events + DROP CONSTRAINT IF EXISTS billable_events_event_type_check; + +ALTER TABLE billable_events + ADD CONSTRAINT billable_events_event_type_check + CHECK (event_type IN ('reserved_usage', 'overage_usage', 'disk_overage_usage')); diff --git a/internal/db/migrations/045_burst_billable_events.up.sql b/internal/db/migrations/045_burst_billable_events.up.sql new file mode 100644 index 00000000..bcc49b37 --- /dev/null +++ b/internal/db/migrations/045_burst_billable_events.up.sql @@ -0,0 +1,6 @@ +ALTER TABLE billable_events + DROP CONSTRAINT IF EXISTS billable_events_event_type_check; + +ALTER TABLE billable_events + ADD CONSTRAINT billable_events_event_type_check + CHECK (event_type IN ('reserved_usage', 'overage_usage', 'burst_usage', 'disk_overage_usage')); diff --git a/internal/db/store.go b/internal/db/store.go index a1862bad..5ae24aeb 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -142,6 +142,7 @@ func (s *Store) Migrate(ctx context.Context) error { {42, "migrations/042_global_sync_outbox.up.sql"}, {43, "migrations/043_credit_halt.up.sql"}, {44, "migrations/044_drop_secret_store_fk.up.sql"}, + {45, "migrations/045_burst_billable_events.up.sql"}, } for _, m := range migrations { diff --git a/internal/db/usage.go b/internal/db/usage.go index 5f172ce6..ae08d729 100644 --- a/internal/db/usage.go +++ 
b/internal/db/usage.go @@ -15,6 +15,7 @@ type ScaleEvent struct { MemoryMB int `json:"memoryMB"` CPUPct int `json:"cpuPercent"` DiskMB int `json:"diskMB"` + Burst bool `json:"burst"` StartedAt time.Time `json:"startedAt"` EndedAt *time.Time `json:"endedAt,omitempty"` } From 97057ff4f536ddc1a9db0d726cff7e957a8595a2 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 10 Jun 2026 09:40:09 -0700 Subject: [PATCH 30/32] Bake burst worker AMI dependencies --- .github/workflows/build-aws-worker-ami.yml | 9 +- ...s.pkr.hcl => worker-ami-aws-burst.pkr.hcl} | 110 +++++++++++++----- deploy/vector/populate-vector-env.sh | 2 +- internal/compute/ec2.go | 29 +++-- 4 files changed, 110 insertions(+), 40 deletions(-) rename deploy/packer/{worker-ami-aws.pkr.hcl => worker-ami-aws-burst.pkr.hcl} (64%) diff --git a/.github/workflows/build-aws-worker-ami.yml b/.github/workflows/build-aws-worker-ami.yml index 303a06a7..b995bc17 100644 --- a/.github/workflows/build-aws-worker-ami.yml +++ b/.github/workflows/build-aws-worker-ami.yml @@ -89,7 +89,7 @@ jobs: run: tar czf /tmp/packer-vector-ctx.tar.gz -C deploy vector - name: Packer init - run: packer init deploy/packer/worker-ami-aws.pkr.hcl + run: packer init deploy/packer/worker-ami-aws-burst.pkr.hcl - name: Build and publish AMI run: | @@ -98,7 +98,11 @@ jobs: -var "agent_version=$AGENT_VERSION" \ -var "region=$AWS_REGION" \ -var "instance_type=$BUILDER_INSTANCE_TYPE" \ - deploy/packer/worker-ami-aws.pkr.hcl | tee /tmp/packer-output.txt + -var "tigris_endpoint=${{ secrets.TIGRIS_ENDPOINT }}" \ + -var "tigris_access_key_id=${{ secrets.TIGRIS_ACCESS_KEY_ID }}" \ + -var "tigris_secret_access_key=${{ secrets.TIGRIS_SECRET_ACCESS_KEY }}" \ + -var "tigris_goldens_bucket=${{ secrets.TIGRIS_GOLDENS_BUCKET }}" \ + deploy/packer/worker-ami-aws-burst.pkr.hcl | tee /tmp/packer-output.txt - name: Read AMI manifest id: ami @@ -186,6 +190,7 @@ jobs: echo "- **Worker version:** \`$VERSION\`" >> "$GITHUB_STEP_SUMMARY" echo "- **Agent version:** 
\`$AGENT_VERSION\`" >> "$GITHUB_STEP_SUMMARY" echo "- **Golden version:** \`${GOLDEN_VERSION:-unknown}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- **Golden cache:** \`${{ secrets.TIGRIS_GOLDENS_BUCKET != '' && 'Tigris enabled' || 'disabled' }}\`" >> "$GITHUB_STEP_SUMMARY" echo "- **Region:** \`$AWS_REGION\`" >> "$GITHUB_STEP_SUMMARY" echo "- **SSM pointer:** \`$SSM_PARAMETER_PREFIX/worker-ami-id\`" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" diff --git a/deploy/packer/worker-ami-aws.pkr.hcl b/deploy/packer/worker-ami-aws-burst.pkr.hcl similarity index 64% rename from deploy/packer/worker-ami-aws.pkr.hcl rename to deploy/packer/worker-ami-aws-burst.pkr.hcl index 9a7b4edf..9f5b019f 100644 --- a/deploy/packer/worker-ami-aws.pkr.hcl +++ b/deploy/packer/worker-ami-aws-burst.pkr.hcl @@ -1,4 +1,4 @@ -# worker-ami-aws.pkr.hcl — Build an immutable AMI for OpenSandbox workers (QEMU backend) on AWS. +# worker-ami-aws-burst.pkr.hcl — Build an immutable AMI for OpenSandbox Burst workers on AWS. # # Mirrors deploy/packer/worker-ami.pkr.hcl (Azure variant) but targets the # amazon-ebs builder. The setup script (`deploy/azure/setup-azure-host.sh`) @@ -7,10 +7,9 @@ # # Differences from the Azure file: # - amazon-ebs source on Ubuntu 24.04 LTS x86_64 instead of azure-arm. -# - No rootfs blob caching (the Azure variant's elaborate Azure-blob cache -# dance was the only Azure-API touch; for the PoC we just rebuild the -# rootfs each time, ~10min extra per bake — acceptable for low rebuild -# frequency). +# - Optional Tigris/S3-compatible rootfs blob caching. Same rootfs inputs +# reuse the same cached default.ext4, which keeps AMI builds fast and +# golden versions stable when the guest image did not change. # - Installs awscli (needed by deploy/vector/populate-vector-env.sh AWS path # and by the worker user-data shared-disk attach). 
# - Tags the AMI for the terraform `aws_ami` data source lookup @@ -26,8 +25,8 @@ # tar czf /tmp/packer-rootfs-ctx.tar.gz deploy/firecracker/rootfs/ deploy/ec2/build-rootfs-docker.sh scripts/claude-agent-wrapper/ # # # 3. Run packer: -# packer init deploy/packer/worker-ami-aws.pkr.hcl -# packer build -var "worker_version=$(git rev-parse --short HEAD)" deploy/packer/worker-ami-aws.pkr.hcl +# packer init deploy/packer/worker-ami-aws-burst.pkr.hcl +# packer build -var "worker_version=$(git rev-parse --short HEAD)" deploy/packer/worker-ami-aws-burst.pkr.hcl # # # 4. The data source in opencomputer-infra/terraform/aws/us-east-2-poc/ami.tf # # picks up the new AMI on the next `tofu apply`. @@ -88,10 +87,30 @@ variable "vector_context" { description = "Pre-built tarball of deploy/vector/ (config + populator + units). Pre-create with: tar czf /tmp/packer-vector-ctx.tar.gz deploy/vector/" } -variable "golden_cache_bucket" { +variable "tigris_endpoint" { type = string default = "" - description = "Optional S3 bucket to upload the bake's golden default.ext4 to (under bases//). Cell-scoped — e.g. oc-aws-us-east-2-poc-golden-cache. Empty = skip upload." + description = "Optional S3-compatible endpoint for Tigris rootfs/golden cache." +} + +variable "tigris_access_key_id" { + type = string + default = "" + sensitive = true + description = "Optional Tigris access key for rootfs/golden cache." +} + +variable "tigris_secret_access_key" { + type = string + default = "" + sensitive = true + description = "Optional Tigris secret key for rootfs/golden cache." +} + +variable "tigris_goldens_bucket" { + type = string + default = "" + description = "Optional Tigris bucket for content-addressed rootfs cache and golden uploads. Empty = skip cache." 
} # --------------------------------------------------------------------- @@ -104,8 +123,8 @@ source "amazon-ebs" "worker" { ssh_username = "ubuntu" ssh_pty = true - ami_name = "opensandbox-worker-${var.worker_version}-${formatdate("YYYYMMDD-hhmm", timestamp())}" - ami_description = "OpenSandbox worker AMI (Ubuntu 24.04, QEMU/KVM nested-virt). Built from git ${var.worker_version}." + ami_name = "opensandbox-burst-worker-${var.worker_version}-${formatdate("YYYYMMDD-hhmm", timestamp())}" + ami_description = "OpenSandbox Burst worker AMI (Ubuntu 24.04, QEMU/KVM nested-virt). Built from git ${var.worker_version}." source_ami_filter { filters = { @@ -131,7 +150,7 @@ source "amazon-ebs" "worker" { # AMI tags — the terraform `aws_ami` data source in the AWS leaf filters # on these to pick the most-recent worker AMI for this cloud. tags = { - Name = "opensandbox-worker-${var.worker_version}" + Name = "opensandbox-burst-worker-${var.worker_version}" "opensandbox-role" = "worker" "opensandbox-cloud" = "aws" "opensandbox-version" = var.worker_version @@ -204,18 +223,30 @@ build { } # 6. AWS-specific: install awscli (used by populate-vector-env.sh and by - # the worker user-data's shared-disk attach), then install binaries and - # build the golden rootfs. + # the worker user-data's shared-disk attach), bake OCFS2 dependencies for + # the shared data volume, then install binaries and build the golden rootfs. provisioner "shell" { execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'" + environment_vars = [ + "TIGRIS_ENDPOINT=${var.tigris_endpoint}", + "TIGRIS_ACCESS_KEY_ID=${var.tigris_access_key_id}", + "TIGRIS_SECRET_ACCESS_KEY=${var.tigris_secret_access_key}", + "TIGRIS_GOLDENS_BUCKET=${var.tigris_goldens_bucket}", + "AWS_DEFAULT_REGION=auto", + ] inline = [ # awscli v2 — apt's `awscli` is v1 and missing some commands we use. 
"apt-get update -qq", - "apt-get install -y -qq unzip", + "DEBIAN_FRONTEND=noninteractive apt-get install -y -qq unzip ocfs2-tools \"linux-modules-extra-$(uname -r)\"", "curl -fsSL 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o /tmp/awscliv2.zip", "cd /tmp && unzip -q awscliv2.zip && ./aws/install --update", "rm -rf /tmp/awscliv2.zip /tmp/aws", "aws --version", + "modprobe ocfs2", + "modprobe ocfs2_dlmfs", + "modprobe ocfs2_stack_o2cb", + "command -v mount.ocfs2", + "systemctl disable --now apt-daily.timer apt-daily-upgrade.timer apt-daily.service apt-daily-upgrade.service 2>/dev/null || true", # Install worker + agent binaries. "mv /tmp/opensandbox-worker /usr/local/bin/opensandbox-worker", @@ -228,15 +259,33 @@ build { "systemctl daemon-reload", "systemctl enable opensandbox-worker.service", - # Build the golden rootfs (no caching for PoC — every bake builds from scratch). + # Build or restore the golden rootfs. The cache key is content-addressed + # from the guest agent, rootfs sources, and guest kernel modules. 
"mkdir -p /tmp/rootfs-ctx", "cd /tmp/rootfs-ctx && tar xzf /tmp/rootfs-ctx.tar.gz", "INPUT_HASH=$({ sha256sum /usr/local/bin/osb-agent; find /tmp/rootfs-ctx -type f | sort | xargs sha256sum; sha256sum /opt/opensandbox/guest-modules/*.ko* 2>/dev/null; } | sha256sum | awk '{print $1}')", "echo \"Rootfs input hash: $INPUT_HASH\"", "ROOTFS_UUID=$(echo \"$INPUT_HASH\" | head -c 32 | sed 's/\\(........\\)\\(....\\)\\(....\\)\\(....\\)\\(............\\)/\\1-\\2-\\3-\\4-\\5/')", "export ROOTFS_UUID", + "INPUT_HASH_SHORT=$(echo \"$INPUT_HASH\" | cut -c1-16)", + "CACHE_KEY=\"rootfs-cache/$INPUT_HASH_SHORT/default.ext4\"", + "CACHE_HIT=0", "mkdir -p /data/firecracker/images /opt/opensandbox/images", - "cd /tmp/rootfs-ctx && bash deploy/ec2/build-rootfs-docker.sh /usr/local/bin/osb-agent /data/firecracker/images default", + "if [ -n \"$TIGRIS_ENDPOINT\" ] && [ -n \"$TIGRIS_ACCESS_KEY_ID\" ] && [ -n \"$TIGRIS_SECRET_ACCESS_KEY\" ] && [ -n \"$TIGRIS_GOLDENS_BUCKET\" ]; then", + " export AWS_ACCESS_KEY_ID=\"$TIGRIS_ACCESS_KEY_ID\" AWS_SECRET_ACCESS_KEY=\"$TIGRIS_SECRET_ACCESS_KEY\"", + " echo \"Checking rootfs cache: s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\"", + " if aws s3 cp --endpoint-url \"$TIGRIS_ENDPOINT\" \"s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\" /data/firecracker/images/default.ext4; then", + " CACHE_HIT=1", + " echo 'Rootfs restored from cache — skipping Docker build'", + " else", + " echo 'Rootfs cache miss — building from source'", + " fi", + "else", + " echo 'Tigris cache credentials incomplete; rootfs cache disabled'", + "fi", + "if [ \"$CACHE_HIT\" != \"1\" ]; then", + " cd /tmp/rootfs-ctx && ROOTFS_UUID=\"$ROOTFS_UUID\" bash deploy/ec2/build-rootfs-docker.sh /usr/local/bin/osb-agent /data/firecracker/images default", + "fi", "cp /data/firecracker/images/default.ext4 /opt/opensandbox/images/default.ext4", # Inject guest kernel modules into rootfs. 
@@ -255,31 +304,36 @@ build { "GOLDEN_VERSION=$(/usr/local/bin/opensandbox-worker golden-version /opt/opensandbox/images/default.ext4 2>/dev/null || sha256sum /opt/opensandbox/images/default.ext4 | awk '{print $1}')", "echo \"$GOLDEN_VERSION\" > /opt/opensandbox/images/golden-version", "echo \"Golden version: $GOLDEN_VERSION\"", + "if [ \"$CACHE_HIT\" != \"1\" ] && [ -n \"$TIGRIS_ENDPOINT\" ] && [ -n \"$TIGRIS_ACCESS_KEY_ID\" ] && [ -n \"$TIGRIS_SECRET_ACCESS_KEY\" ] && [ -n \"$TIGRIS_GOLDENS_BUCKET\" ]; then", + " export AWS_ACCESS_KEY_ID=\"$TIGRIS_ACCESS_KEY_ID\" AWS_SECRET_ACCESS_KEY=\"$TIGRIS_SECRET_ACCESS_KEY\"", + " echo \"Uploading rootfs cache: s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\"", + " aws s3 cp --endpoint-url \"$TIGRIS_ENDPOINT\" /opt/opensandbox/images/default.ext4 \"s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\" || echo 'rootfs cache upload failed — continuing'", + "fi", ] } - # 7. Optional: upload the golden to S3 so the cell's shared-disk seeder + # 7. Optional: upload the golden to Tigris so future hydration paths # + future per-instance prefetch path can fetch it without rebuilding. 
provisioner "shell" { execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'" environment_vars = [ - "GOLDEN_CACHE_BUCKET=${var.golden_cache_bucket}", - "AWS_DEFAULT_REGION=${var.region}", + "TIGRIS_ENDPOINT=${var.tigris_endpoint}", + "TIGRIS_ACCESS_KEY_ID=${var.tigris_access_key_id}", + "TIGRIS_SECRET_ACCESS_KEY=${var.tigris_secret_access_key}", + "TIGRIS_GOLDENS_BUCKET=${var.tigris_goldens_bucket}", + "AWS_DEFAULT_REGION=auto", ] inline = [ "set -e", - "if [ -z \"$GOLDEN_CACHE_BUCKET\" ]; then", - " echo 'No golden_cache_bucket set; skipping S3 upload (worker AMI still includes the baked golden)'", + "if [ -z \"$TIGRIS_ENDPOINT\" ] || [ -z \"$TIGRIS_ACCESS_KEY_ID\" ] || [ -z \"$TIGRIS_SECRET_ACCESS_KEY\" ] || [ -z \"$TIGRIS_GOLDENS_BUCKET\" ]; then", + " echo 'Tigris cache credentials incomplete; skipping golden upload (worker AMI still includes the baked golden)'", " exit 0", "fi", + "export AWS_ACCESS_KEY_ID=\"$TIGRIS_ACCESS_KEY_ID\" AWS_SECRET_ACCESS_KEY=\"$TIGRIS_SECRET_ACCESS_KEY\"", "GOLDEN_VERSION=$(cat /opt/opensandbox/images/golden-version)", "S3_KEY=\"bases/$GOLDEN_VERSION/default.ext4\"", - "echo \"Uploading default.ext4 → s3://$GOLDEN_CACHE_BUCKET/$S3_KEY (~4GB, will take a moment)\"", - # Instance profile credentials — the bake runs on an EC2 instance and - # picks up its role via the metadata service. If the builder role - # doesn't have s3:PutObject on the cell's bucket, the upload fails - # gracefully and the AMI still works (just without S3-side hydration). 
- "aws s3 cp /opt/opensandbox/images/default.ext4 \"s3://$GOLDEN_CACHE_BUCKET/$S3_KEY\" || echo 'S3 upload failed — continuing (AMI golden is the only copy)'", + "echo \"Uploading default.ext4 -> s3://$TIGRIS_GOLDENS_BUCKET/$S3_KEY (~4GB, will take a moment)\"", + "aws s3 cp --endpoint-url \"$TIGRIS_ENDPOINT\" /opt/opensandbox/images/default.ext4 \"s3://$TIGRIS_GOLDENS_BUCKET/$S3_KEY\" || echo 'Tigris upload failed — continuing (AMI golden is the only copy)'", ] } diff --git a/deploy/vector/populate-vector-env.sh b/deploy/vector/populate-vector-env.sh index 26ac1672..80c05b63 100755 --- a/deploy/vector/populate-vector-env.sh +++ b/deploy/vector/populate-vector-env.sh @@ -174,7 +174,7 @@ aws) exit 0 fi if ! command -v aws >/dev/null 2>&1; then - log "aws CLI not installed in AMI — populator can't fetch from Secrets Manager. Bake awscli into the worker image (see deploy/packer/worker-ami-aws.pkr.hcl)." + log "aws CLI not installed in AMI — populator can't fetch from Secrets Manager. Bake awscli into the worker image (see deploy/packer/worker-ami-aws-burst.pkr.hcl)." exit 0 fi # Auto-detect region from IMDSv2 so we don't have to plumb it via env. 
diff --git a/internal/compute/ec2.go b/internal/compute/ec2.go index 9a6e80ee..14a29035 100644 --- a/internal/compute/ec2.go +++ b/internal/compute/ec2.go @@ -467,6 +467,8 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { _ = opts // opts.Region/Size honored at instance launch; cloud-init is cell-uniform var sb strings.Builder sb.WriteString("#!/bin/bash\nset -euo pipefail\n\n") + sb.WriteString("oc_boot_log() { echo \"opensandbox-worker-bootstrap $(date -Is) $*\"; }\n") + sb.WriteString("oc_boot_log 'user-data start'\n\n") sb.WriteString("systemctl stop opensandbox-worker.service 2>/dev/null || true\n") sb.WriteString("systemctl disable opensandbox-worker.service 2>/dev/null || true\n") sb.WriteString("systemctl reset-failed opensandbox-worker.service 2>/dev/null || true\n\n") @@ -479,6 +481,7 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString("MY_IP=$(curl -fsS -H \"X-aws-ec2-metadata-token: $TOKEN\" http://169.254.169.254/latest/meta-data/local-ipv4)\n") sb.WriteString("INSTANCE_ID=$(curl -fsS -H \"X-aws-ec2-metadata-token: $TOKEN\" http://169.254.169.254/latest/meta-data/instance-id)\n") sb.WriteString("WORKER_ID=\"w-aws-${INSTANCE_ID}\"\n\n") + sb.WriteString("oc_boot_log \"instance identity ready: $INSTANCE_ID $MY_IP\"\n\n") // NVMe instance store handling. 
Larger metal/x.gd instance families expose // multiple NVMe drives at /dev/nvme[1-N]n1; smaller instances rely on EBS @@ -505,6 +508,7 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString(" fi\n") sb.WriteString("fi\n") sb.WriteString("mkdir -p /data/sandboxes /data/firecracker/images\n") + sb.WriteString("oc_boot_log 'base data mount ready'\n\n") if p.cfg.SharedSandboxDataVolumeID != "" { sb.WriteString(p.sharedSandboxDataUserData()) @@ -515,6 +519,7 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString("# Copy AMI-baked rootfs images to data disk if not already present\n") sb.WriteString("if [ -d /opt/opensandbox/images ] && [ ! -f /data/firecracker/images/default.ext4 ]; then\n") + sb.WriteString(" oc_boot_log 'copying AMI-baked rootfs to data disk'\n") sb.WriteString(" cp /opt/opensandbox/images/*.ext4 /data/firecracker/images/ 2>/dev/null || true\n") sb.WriteString("fi\n") sb.WriteString("if [ -d /opt/opensandbox/images/bases ] && [ ! -d /data/firecracker/images/bases ]; then\n") @@ -542,7 +547,9 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString("rm -rf /data/sandboxes/golden-snapshot /data/sandboxes/golden\n\n") // Start worker + sb.WriteString("oc_boot_log 'starting opensandbox-worker service'\n") sb.WriteString("systemctl restart opensandbox-worker\n") + sb.WriteString("oc_boot_log 'user-data complete'\n") return sb.String() } @@ -566,18 +573,16 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { var sb strings.Builder sb.WriteString("# Shared sandbox data: OCFS2 over io2 Multi-Attach\n") - sb.WriteString("if ! 
command -v mount.ocfs2 >/dev/null 2>&1; then\n") - sb.WriteString(" for i in $(seq 1 120); do\n") - sb.WriteString(" fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1 || fuser /var/lib/dpkg/lock >/dev/null 2>&1 || fuser /var/lib/apt/lists/lock >/dev/null 2>&1 || break\n") - sb.WriteString(" sleep 2\n") - sb.WriteString(" done\n") - sb.WriteString(" apt-get update\n") - sb.WriteString(" DEBIAN_FRONTEND=noninteractive apt-get install -y ocfs2-tools \"linux-modules-extra-$(uname -r)\"\n") - sb.WriteString("fi\n") + sb.WriteString("oc_boot_log 'validating baked OCFS2 dependencies'\n") + sb.WriteString("command -v mount.ocfs2 >/dev/null 2>&1 || { echo 'ERROR: AMI missing ocfs2-tools; rebuild worker AMI'; exit 1; }\n") + sb.WriteString("modprobe ocfs2 || { echo 'ERROR: AMI missing ocfs2 kernel module; rebuild worker AMI with linux-modules-extra'; exit 1; }\n") + sb.WriteString("modprobe ocfs2_dlmfs || { echo 'ERROR: AMI missing ocfs2_dlmfs kernel module; rebuild worker AMI'; exit 1; }\n") + sb.WriteString("modprobe ocfs2_stack_o2cb || { echo 'ERROR: AMI missing ocfs2_stack_o2cb kernel module; rebuild worker AMI'; exit 1; }\n") sb.WriteString(fmt.Sprintf("SANDBOX_VOLUME_ID=%q\n", p.cfg.SharedSandboxDataVolumeID)) sb.WriteString(fmt.Sprintf("OCFS2_CLUSTER_NAME=%q\n", clusterName)) sb.WriteString(fmt.Sprintf("OCFS2_EXPECTED_NODES=%d\n", expectedNodes)) sb.WriteString(fmt.Sprintf("OCFS2_MAX_NODES=%d\n", maxNodes)) + sb.WriteString("oc_boot_log \"attaching shared sandbox data volume $SANDBOX_VOLUME_ID\"\n") sb.WriteString("aws ec2 attach-volume --region " + shellQuote(p.cfg.Region) + " --volume-id \"$SANDBOX_VOLUME_ID\" --instance-id \"$INSTANCE_ID\" --device /dev/sdg || true\n") sb.WriteString("SANDBOX_DEV=\"\"\n") sb.WriteString("SANDBOX_VOL_NO_DASH=\"${SANDBOX_VOLUME_ID//-/}\"\n") @@ -596,6 +601,8 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString("SANDBOX_SERIAL=$(lsblk -dn -o SERIAL \"$SANDBOX_DEV\" 2>/dev/null | head -1 || true)\n") 
sb.WriteString("if [ \"$SANDBOX_SERIAL\" != \"$SANDBOX_VOL_NO_DASH\" ]; then echo \"ERROR: $SANDBOX_DEV serial $SANDBOX_SERIAL does not match sandbox volume $SANDBOX_VOLUME_ID\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") sb.WriteString("echo \"Using shared sandbox data volume $SANDBOX_VOLUME_ID at $SANDBOX_DEV\"\n") + sb.WriteString("oc_boot_log \"shared sandbox data volume visible at $SANDBOX_DEV\"\n") + sb.WriteString("oc_boot_log 'discovering OCFS2 peer nodes'\n") sb.WriteString("mapfile -t OCFS2_NODES < <(for i in $(seq 1 60); do aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u; break; done)\n") sb.WriteString("for i in $(seq 1 60); do\n") sb.WriteString(" [ \"${#OCFS2_NODES[@]}\" -ge \"$OCFS2_EXPECTED_NODES\" ] && break\n") @@ -603,11 +610,13 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString(" mapfile -t OCFS2_NODES < <(aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u)\n") sb.WriteString("done\n") sb.WriteString("if [ \"${#OCFS2_NODES[@]}\" -lt \"$OCFS2_EXPECTED_NODES\" ]; then echo \"ERROR: found ${#OCFS2_NODES[@]} OCFS2 nodes, expected $OCFS2_EXPECTED_NODES\"; exit 1; fi\n") + sb.WriteString("oc_boot_log \"OCFS2 peer nodes: ${OCFS2_NODES[*]}\"\n") sb.WriteString("install -d -m 0755 /etc/ocfs2 /etc/sysconfig\n") sb.WriteString("{ echo \"cluster:\"; echo \" node_count = ${#OCFS2_NODES[@]}\"; echo 
\" name = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=0; for node in \"${OCFS2_NODES[@]}\"; do ip=$(getent ahostsv4 \"$node\" | awk '{print $1; exit}'); [ -n \"${ip:-}\" ] || { echo \"ERROR: could not resolve OCFS2 node $node\"; exit 1; }; echo \"node:\"; echo \" ip_port = 7777\"; echo \" ip_address = $ip\"; echo \" number = $n\"; echo \" name = $node\"; echo \" cluster = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=$((n + 1)); done; } > /etc/ocfs2/cluster.conf\n") sb.WriteString("cat > /etc/default/o2cb </dev/null 2>&1 && o2cb register-cluster \"$OCFS2_CLUSTER_NAME\" || true\n") sb.WriteString("[ -x /etc/init.d/o2cb ] && /etc/init.d/o2cb online \"$OCFS2_CLUSTER_NAME\" || true\n") sb.WriteString("mkdir -p /data/sandboxes\n") @@ -616,7 +625,9 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString("FSTYPE=$(blkid -s TYPE -o value \"$SANDBOX_DEV\" 2>/dev/null || true)\n") sb.WriteString("if [ \"$FSTYPE\" != \"ocfs2\" ]; then echo \"ERROR: shared sandbox data volume $SANDBOX_DEV has filesystem '$FSTYPE', expected ocfs2\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") sb.WriteString("if ! 
grep -q 'LABEL=opensandbox-sandboxes' /etc/fstab; then echo 'LABEL=opensandbox-sandboxes /data/sandboxes ocfs2 noauto,_netdev,noatime 0 0' >> /etc/fstab; fi\n") + sb.WriteString("oc_boot_log 'mounting OCFS2 shared sandbox data volume'\n") sb.WriteString("timeout 90 mount -t ocfs2 -o noatime \"$SANDBOX_DEV\" /data/sandboxes\n") + sb.WriteString("oc_boot_log 'OCFS2 shared sandbox data mounted'\n") sb.WriteString("chown root:root /data/sandboxes\n\n") return sb.String() } From 9d7553dafa0a88b1a87c13478c3f21c53968a275 Mon Sep 17 00:00:00 2001 From: Mohamed Habib Date: Wed, 10 Jun 2026 12:36:16 -0700 Subject: [PATCH 31/32] Use static OCFS2 slots for AWS workers --- cmd/server/main.go | 1 + deploy/server.env.example | 1 + internal/compute/ec2.go | 137 ++++++++++++++++++++++++++--- internal/compute/ec2_ocfs2_test.go | 48 ++++++++++ internal/config/config.go | 2 + internal/config/secrets.go | 65 ++++++++------ 6 files changed, 211 insertions(+), 43 deletions(-) create mode 100644 internal/compute/ec2_ocfs2_test.go diff --git a/cmd/server/main.go b/cmd/server/main.go index 8437a9a3..9d559121 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -460,6 +460,7 @@ func main() { OCFS2ClusterName: cfg.EC2OCFS2ClusterName, OCFS2ExpectedNodes: cfg.EC2OCFS2ExpectedNodes, OCFS2MaxNodes: cfg.EC2OCFS2MaxNodes, + OCFS2NodeIPs: cfg.EC2OCFS2NodeIPs, }) if err != nil { log.Fatalf("opensandbox: failed to create EC2 pool: %v", err) diff --git a/deploy/server.env.example b/deploy/server.env.example index f441193e..06b62187 100644 --- a/deploy/server.env.example +++ b/deploy/server.env.example @@ -52,3 +52,4 @@ OPENSANDBOX_SENTRY_ENVIRONMENT=production # server-stripe-secret-key → STRIPE_SECRET_KEY # server-stripe-webhook-secret → STRIPE_WEBHOOK_SECRET # server-sentry-dsn → OPENSANDBOX_SENTRY_DSN +# server-ocfs2-node-ips → OPENSANDBOX_OCFS2_NODE_IPS diff --git a/internal/compute/ec2.go b/internal/compute/ec2.go index 14a29035..0d11500c 100644 --- a/internal/compute/ec2.go +++ 
b/internal/compute/ec2.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "log" + "strconv" "strings" "sync" @@ -66,6 +67,8 @@ const ( awsTagCell = "opensandbox:cell" awsTagInstanceType = "opensandbox:instance-type" awsTagDraining = "opensandbox:draining" + awsTagOCFS2Slot = "opensandbox:ocfs2-slot" + awsTagOCFS2IP = "opensandbox:ocfs2-ip" awsTagWorker = "worker" ) @@ -89,6 +92,14 @@ type EC2PoolConfig struct { OCFS2ClusterName string OCFS2ExpectedNodes int OCFS2MaxNodes int + OCFS2NodeIPs []string // fixed private IPs, one per OCFS2 node slot +} + +type ocfs2Assignment struct { + Enabled bool + Slot int + IP string + NodeIPs []string } // EC2Pool implements compute.Pool using AWS EC2 instances. @@ -168,7 +179,12 @@ func (p *EC2Pool) CreateMachine(ctx context.Context, opts MachineOpts) (*Machine return nil, fmt.Errorf("ec2: no AMI set (configure AMI or SSMParameterName)") } - userData := p.buildUserData(opts) + ocfs2, err := p.allocateOCFS2Slot(ctx) + if err != nil { + return nil, err + } + + userData := p.buildUserData(opts, ocfs2) machineName := fmt.Sprintf("osb-worker-%s", randomSuffix()) instanceTags := []ec2types.Tag{ {Key: aws.String("Name"), Value: aws.String(machineName)}, @@ -176,6 +192,12 @@ func (p *EC2Pool) CreateMachine(ctx context.Context, opts MachineOpts) (*Machine {Key: aws.String(awsTagRole), Value: aws.String(awsTagWorker)}, {Key: aws.String(awsTagInstanceType), Value: aws.String(instanceType)}, } + if ocfs2.Enabled { + instanceTags = append(instanceTags, + ec2types.Tag{Key: aws.String(awsTagOCFS2Slot), Value: aws.String(strconv.Itoa(ocfs2.Slot))}, + ec2types.Tag{Key: aws.String(awsTagOCFS2IP), Value: aws.String(ocfs2.IP)}, + ) + } volumeTags := []ec2types.Tag{ {Key: aws.String(awsTagRole), Value: aws.String(awsTagWorker)}, } @@ -229,6 +251,9 @@ func (p *EC2Pool) CreateMachine(ctx context.Context, opts MachineOpts) (*Machine if p.cfg.SecurityGroupID != "" { input.SecurityGroupIds = []string{p.cfg.SecurityGroupID} } + if ocfs2.Enabled { + 
input.PrivateIpAddress = aws.String(ocfs2.IP) + } if p.cfg.KeyName != "" { input.KeyName = aws.String(p.cfg.KeyName) } @@ -331,6 +356,67 @@ func (p *EC2Pool) DrainMachine(ctx context.Context, machineID string) error { return nil } +func (p *EC2Pool) allocateOCFS2Slot(ctx context.Context) (ocfs2Assignment, error) { + if p.cfg.SharedSandboxDataVolumeID == "" || len(p.cfg.OCFS2NodeIPs) == 0 { + return ocfs2Assignment{}, nil + } + + used := make(map[int]bool, len(p.cfg.OCFS2NodeIPs)) + ipToSlot := make(map[string]int, len(p.cfg.OCFS2NodeIPs)) + for i, ip := range p.cfg.OCFS2NodeIPs { + ip = strings.TrimSpace(ip) + if ip == "" { + return ocfs2Assignment{}, fmt.Errorf("ec2: OCFS2 node IP slot %d is empty", i) + } + ipToSlot[ip] = i + } + + filters := []ec2types.Filter{ + {Name: aws.String("tag:" + awsTagRole), Values: []string{awsTagWorker}}, + {Name: aws.String("instance-state-name"), Values: []string{"pending", "running", "stopping", "stopped"}}, + } + if p.cfg.CellID != "" { + filters = append(filters, ec2types.Filter{Name: aws.String("tag:" + awsTagCell), Values: []string{p.cfg.CellID}}) + } + + result, err := p.client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{Filters: filters}) + if err != nil { + return ocfs2Assignment{}, fmt.Errorf("ec2: describe workers for OCFS2 slot allocation: %w", err) + } + for _, res := range result.Reservations { + for _, inst := range res.Instances { + if inst.PrivateIpAddress != nil { + if slot, ok := ipToSlot[aws.ToString(inst.PrivateIpAddress)]; ok { + used[slot] = true + } + } + for _, tag := range inst.Tags { + if aws.ToString(tag.Key) != awsTagOCFS2Slot { + continue + } + slot, convErr := strconv.Atoi(aws.ToString(tag.Value)) + if convErr == nil && slot >= 0 && slot < len(p.cfg.OCFS2NodeIPs) { + used[slot] = true + } + } + } + } + + for slot, ip := range p.cfg.OCFS2NodeIPs { + if used[slot] { + continue + } + return ocfs2Assignment{ + Enabled: true, + Slot: slot, + IP: strings.TrimSpace(ip), + NodeIPs: append([]string(nil), 
p.cfg.OCFS2NodeIPs...), + }, nil + } + + return ocfs2Assignment{}, fmt.Errorf("ec2: no free OCFS2 node slots available (%d configured)", len(p.cfg.OCFS2NodeIPs)) +} + // CleanupOrphanedResources reclaims ENIs and EBS volumes left by failed // VM creates. Mirrors the AzurePool's NIC/disk cleanup. // @@ -463,7 +549,7 @@ func (p *EC2Pool) instanceToMachine(inst *ec2types.Instance) *Machine { // buildUserData returns the EC2 instance user-data script. Combines the // CP-supplied WorkerSpec with EC2-specific cloud-init (NVMe instance-store // mount, AMI-baked rootfs copy, machine-id stamping). -func (p *EC2Pool) buildUserData(opts MachineOpts) string { +func (p *EC2Pool) buildUserData(opts MachineOpts, ocfs2 ocfs2Assignment) string { _ = opts // opts.Region/Size honored at instance launch; cloud-init is cell-uniform var sb strings.Builder sb.WriteString("#!/bin/bash\nset -euo pipefail\n\n") @@ -511,7 +597,7 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { sb.WriteString("oc_boot_log 'base data mount ready'\n\n") if p.cfg.SharedSandboxDataVolumeID != "" { - sb.WriteString(p.sharedSandboxDataUserData()) + sb.WriteString(p.sharedSandboxDataUserData(ocfs2)) } if p.cfg.SharedGoldensVolumeID != "" { sb.WriteString(p.sharedGoldensUserData()) @@ -554,7 +640,7 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string { return sb.String() } -func (p *EC2Pool) sharedSandboxDataUserData() string { +func (p *EC2Pool) sharedSandboxDataUserData(ocfs2 ocfs2Assignment) string { clusterName := p.cfg.OCFS2ClusterName if clusterName == "" { clusterName = "opensandbox" @@ -567,6 +653,9 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { if maxNodes <= 0 { maxNodes = expectedNodes } + if ocfs2.Enabled && maxNodes < len(ocfs2.NodeIPs) { + maxNodes = len(ocfs2.NodeIPs) + } if maxNodes < expectedNodes { maxNodes = expectedNodes } @@ -582,6 +671,10 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString(fmt.Sprintf("OCFS2_CLUSTER_NAME=%q\n", clusterName)) 
sb.WriteString(fmt.Sprintf("OCFS2_EXPECTED_NODES=%d\n", expectedNodes)) sb.WriteString(fmt.Sprintf("OCFS2_MAX_NODES=%d\n", maxNodes)) + if ocfs2.Enabled { + sb.WriteString(fmt.Sprintf("OCFS2_NODE_SLOT=%d\n", ocfs2.Slot)) + sb.WriteString(fmt.Sprintf("OCFS2_NODE_IP=%q\n", ocfs2.IP)) + } sb.WriteString("oc_boot_log \"attaching shared sandbox data volume $SANDBOX_VOLUME_ID\"\n") sb.WriteString("aws ec2 attach-volume --region " + shellQuote(p.cfg.Region) + " --volume-id \"$SANDBOX_VOLUME_ID\" --instance-id \"$INSTANCE_ID\" --device /dev/sdg || true\n") sb.WriteString("SANDBOX_DEV=\"\"\n") @@ -602,17 +695,29 @@ func (p *EC2Pool) sharedSandboxDataUserData() string { sb.WriteString("if [ \"$SANDBOX_SERIAL\" != \"$SANDBOX_VOL_NO_DASH\" ]; then echo \"ERROR: $SANDBOX_DEV serial $SANDBOX_SERIAL does not match sandbox volume $SANDBOX_VOLUME_ID\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n") sb.WriteString("echo \"Using shared sandbox data volume $SANDBOX_VOLUME_ID at $SANDBOX_DEV\"\n") sb.WriteString("oc_boot_log \"shared sandbox data volume visible at $SANDBOX_DEV\"\n") - sb.WriteString("oc_boot_log 'discovering OCFS2 peer nodes'\n") - sb.WriteString("mapfile -t OCFS2_NODES < <(for i in $(seq 1 60); do aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u; break; done)\n") - sb.WriteString("for i in $(seq 1 60); do\n") - sb.WriteString(" [ \"${#OCFS2_NODES[@]}\" -ge \"$OCFS2_EXPECTED_NODES\" ] && break\n") - sb.WriteString(" sleep 2\n") - sb.WriteString(" mapfile -t OCFS2_NODES < <(aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" 
\"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u)\n") - sb.WriteString("done\n") - sb.WriteString("if [ \"${#OCFS2_NODES[@]}\" -lt \"$OCFS2_EXPECTED_NODES\" ]; then echo \"ERROR: found ${#OCFS2_NODES[@]} OCFS2 nodes, expected $OCFS2_EXPECTED_NODES\"; exit 1; fi\n") - sb.WriteString("oc_boot_log \"OCFS2 peer nodes: ${OCFS2_NODES[*]}\"\n") sb.WriteString("install -d -m 0755 /etc/ocfs2 /etc/sysconfig\n") - sb.WriteString("{ echo \"cluster:\"; echo \" node_count = ${#OCFS2_NODES[@]}\"; echo \" name = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=0; for node in \"${OCFS2_NODES[@]}\"; do ip=$(getent ahostsv4 \"$node\" | awk '{print $1; exit}'); [ -n \"${ip:-}\" ] || { echo \"ERROR: could not resolve OCFS2 node $node\"; exit 1; }; echo \"node:\"; echo \" ip_port = 7777\"; echo \" ip_address = $ip\"; echo \" number = $n\"; echo \" name = $node\"; echo \" cluster = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=$((n + 1)); done; } > /etc/ocfs2/cluster.conf\n") + if ocfs2.Enabled { + sb.WriteString("oc_boot_log \"using static OCFS2 slot $OCFS2_NODE_SLOT at $OCFS2_NODE_IP\"\n") + sb.WriteString("OCFS2_NODE_NAMES=()\n") + sb.WriteString("OCFS2_NODE_IPS=()\n") + for _, ip := range ocfs2.NodeIPs { + ip = strings.TrimSpace(ip) + sb.WriteString(fmt.Sprintf("OCFS2_NODE_NAMES+=(%q)\n", awsPrivateDNSShortName(ip))) + sb.WriteString(fmt.Sprintf("OCFS2_NODE_IPS+=(%q)\n", ip)) + } + sb.WriteString("{ echo \"cluster:\"; echo \" node_count = ${#OCFS2_NODE_IPS[@]}\"; echo \" name = $OCFS2_CLUSTER_NAME\"; echo \"\"; for i in \"${!OCFS2_NODE_IPS[@]}\"; do echo \"node:\"; echo \" ip_port = 7777\"; echo \" ip_address = ${OCFS2_NODE_IPS[$i]}\"; echo \" number = $i\"; echo \" name = ${OCFS2_NODE_NAMES[$i]}\"; echo \" cluster = $OCFS2_CLUSTER_NAME\"; echo \"\"; done; } > /etc/ocfs2/cluster.conf\n") + } else { + sb.WriteString("oc_boot_log 'discovering 
OCFS2 peer nodes'\n") + sb.WriteString("mapfile -t OCFS2_NODES < <(for i in $(seq 1 60); do aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u; break; done)\n") + sb.WriteString("for i in $(seq 1 60); do\n") + sb.WriteString(" [ \"${#OCFS2_NODES[@]}\" -ge \"$OCFS2_EXPECTED_NODES\" ] && break\n") + sb.WriteString(" sleep 2\n") + sb.WriteString(" mapfile -t OCFS2_NODES < <(aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u)\n") + sb.WriteString("done\n") + sb.WriteString("if [ \"${#OCFS2_NODES[@]}\" -lt \"$OCFS2_EXPECTED_NODES\" ]; then echo \"ERROR: found ${#OCFS2_NODES[@]} OCFS2 nodes, expected $OCFS2_EXPECTED_NODES\"; exit 1; fi\n") + sb.WriteString("oc_boot_log \"OCFS2 peer nodes: ${OCFS2_NODES[*]}\"\n") + sb.WriteString("{ echo \"cluster:\"; echo \" node_count = ${#OCFS2_NODES[@]}\"; echo \" name = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=0; for node in \"${OCFS2_NODES[@]}\"; do ip=$(getent ahostsv4 \"$node\" | awk '{print $1; exit}'); [ -n \"${ip:-}\" ] || { echo \"ERROR: could not resolve OCFS2 node $node\"; exit 1; }; echo \"node:\"; echo \" ip_port = 7777\"; echo \" ip_address = $ip\"; echo \" number = $n\"; echo \" name = $node\"; echo \" cluster = $OCFS2_CLUSTER_NAME\"; echo \"\"; n=$((n + 1)); done; } > /etc/ocfs2/cluster.conf\n") + } sb.WriteString("cat > /etc/default/o2cb < Date: Mon, 15 Jun 2026 15:41:19 -0700 Subject: [PATCH 32/32] feat: prepare burst 
workers cold-ready --- .agents/design/burst-worker-cold-ready.md | 131 +++++++++++ cmd/server/main.go | 25 ++- internal/config/config.go | 4 + internal/config/secrets.go | 2 + internal/controlplane/scaler.go | 163 +++++++++++--- internal/controlplane/scaler_test.go | 205 ++++++++++++++++++ internal/qemu/manager.go | 19 +- internal/worker/grpc_server.go | 40 ++++ .../05-burst-edge-routing-smoke.ts | 196 +++++++++++++++++ 9 files changed, 740 insertions(+), 45 deletions(-) create mode 100644 .agents/design/burst-worker-cold-ready.md create mode 100644 scripts/integration-tests/05-burst-edge-routing-smoke.ts diff --git a/.agents/design/burst-worker-cold-ready.md b/.agents/design/burst-worker-cold-ready.md new file mode 100644 index 00000000..55168109 --- /dev/null +++ b/.agents/design/burst-worker-cold-ready.md @@ -0,0 +1,131 @@ +# Burst Worker Cold-Ready Startup Plan + +## Context + +The burst worker launch test on June 10, 2026 showed two different timing +segments: + +- EC2 instance creation to worker service start was roughly 90 seconds. +- Worker service start to control-plane registration was much longer because + startup blocked on `PrepareGoldenSnapshot`. + +The important observation is that the worker can be useful for cold boots +before the golden snapshot is ready. The current startup path does not expose +that intermediate state because the worker prepares the golden snapshot before +starting its servers and heartbeat. + +## Goal + +Make a newly launched burst worker register as soon as it is cold-boot capable, +while preparing the golden snapshot in the background. + +Target behavior: + +- Worker becomes schedulable for cold boots as soon as networking, env, shared + mounts, gRPC, HTTP, and Redis heartbeat are ready. +- Golden snapshot preparation continues asynchronously. +- Once the golden snapshot is ready, the worker heartbeat advertises the golden + version and the control plane can prefer it for fast creates. 
+ +This does not remove EC2 launch latency. It removes golden snapshot creation +from the critical path for worker registration. + +## Proposed Changes + +1. Move golden snapshot preparation out of the blocking worker startup path. + + Today `cmd/worker/main.go` calls `PrepareGoldenSnapshot()` before starting + metadata, HTTP/gRPC, and Redis heartbeat. Move this after server startup and + heartbeat setup, running in a background goroutine. + +2. Register the worker as cold-ready first. + + Heartbeat should be published with no `golden_version` until the snapshot is + ready. The control plane already treats empty `golden_version` as "no golden + snapshot available"; keep that meaning. + +3. Update heartbeat when golden prep completes. + + After background `PrepareGoldenSnapshot()` succeeds, call + `hb.SetGoldenVersion(qemuMgr.GoldenVersion())`. The next heartbeat should + update the registry. + +4. Add explicit logs for readiness phases. + + Suggested log points: + + - `worker cold-ready: starting heartbeat before golden snapshot` + - `worker golden snapshot preparation started in background` + - `worker golden-ready: version=<version>` + - `worker golden preparation failed: <error>; continuing cold-ready` + +5. Fix AMI/systemd ordering for burst workers. + + The burst AMI currently enables `opensandbox-worker.service`, so systemd can + start it before user-data writes `/etc/opensandbox/worker.env`. That caused + repeated `Failed to load environment files` messages during boot. + + Change the burst Packer file to install the worker unit but leave it + disabled. User-data should start the worker exactly once after: + + - instance identity is known + - shared volumes are attached/mounted + - `/etc/opensandbox/worker.env` has been written and patched + +6. Keep user-data minimal.
+ + User-data should only do runtime-specific work: + + - fetch instance identity + - attach/mount shared volumes + - write env + - start worker + + Dependency installation, binaries, OCFS2 tools, AWS CLI, QEMU, kernel + modules, and rootfs assets should stay baked into the AMI. + +## Non-Goals + +- Do not change Spot instance type fallback strategy yet. +- Do not try to guarantee sub-10-second readiness from a brand-new EC2 launch. +- Do not implement downloaded/prebuilt QEMU memory snapshots in this pass. +- Do not change public API behavior. + +## Expected Impact + +Based on the June 10 test: + +- Current EC2-created-to-registered time was about 6 minutes 24 seconds. +- Worker service started about 91 seconds after EC2 creation. +- Moving golden prep to the background could make cold-ready registration close + to that worker-service-start time, likely around 90-100 seconds from EC2 + creation before further AMI cleanup. + +With AMI/systemd cleanup, a realistic next target is roughly 45-70 seconds from +EC2 creation to cold-ready in favorable cases. + +## Risks + +- Cold-ready workers may serve slower first sandboxes until golden prep + completes. +- Some scheduling paths may implicitly assume a non-empty `golden_version`. + Those paths need review before allowing all workloads onto cold-ready workers. +- Migration/checkpoint paths that require a known source golden version should + continue to require it. + +## Validation Plan + +1. Build and deploy a worker with background golden prep. +2. Launch a fresh burst worker and capture timestamps: + - scaler launch decision + - EC2 instance created + - user-data start + - worker service start + - first Redis heartbeat / CP registration + - golden snapshot ready +3. Confirm the CP sees the worker before golden snapshot readiness. +4. Create a sandbox on the cold-ready worker and verify it succeeds via cold + boot. +5. Wait for golden-ready heartbeat and verify subsequent creates use the golden + path. +6. 
Terminate the extra worker after the test to avoid unnecessary cost. diff --git a/cmd/server/main.go b/cmd/server/main.go index 9d559121..0891f6eb 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -501,17 +501,20 @@ func main() { scalerState := controlplane.NewRedisScalerState(redisRegistry.RedisClient()) scaler := controlplane.NewScaler(controlplane.ScalerConfig{ - Pool: pool, - Registry: redisRegistry, - Store: opts.Store, - StateStore: scalerState, - WorkerImage: workerImage, - Cooldown: time.Duration(cfg.ScaleCooldownSec) * time.Second, - MinWorkers: cfg.MinWorkersPerRegion, - MaxWorkers: cfg.MaxWorkersPerRegion, - IdleReserve: cfg.IdleReserveWorkers, - WorkerPool: cfg.WorkerPool, - MachineSizes: machineSizes, + Pool: pool, + Registry: redisRegistry, + Store: opts.Store, + StateStore: scalerState, + WorkerImage: workerImage, + Cooldown: time.Duration(cfg.ScaleCooldownSec) * time.Second, + MinWorkers: cfg.MinWorkersPerRegion, + MaxWorkers: cfg.MaxWorkersPerRegion, + IdleReserve: cfg.IdleReserveWorkers, + MinIdleCapacity: cfg.MinIdleCapacity, + MinIdleCPUs: cfg.MinIdleCPUs, + DefaultSandboxCPUs: cfg.DefaultSandboxCPUs, + WorkerPool: cfg.WorkerPool, + MachineSizes: machineSizes, // For "migrated" event emit after scaler-driven migrations // (rolling replace, evacuation) — keeps D1 sandboxes_index // worker_id in sync with cell-PG truth. Without this, the diff --git a/internal/config/config.go b/internal/config/config.go index e3b6ea5c..9a21293d 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -154,6 +154,8 @@ type Config struct { MinWorkersPerRegion int // Minimum total workers per region, default 1 MaxWorkersPerRegion int // Maximum workers per region (hard cap), default 10 IdleReserveWorkers int // Target idle workers for burst absorption, default 1 + MinIdleCapacity int // Minimum spare sandbox capacity slots per region. When >0, overrides MinWorkers/IdleReserve. + MinIdleCPUs int // Minimum spare sandbox CPU units per region. 
When >0, overrides MinIdleCapacity. // Stripe billing StripeSecretKey string @@ -395,6 +397,8 @@ func Load() (*Config, error) { MinWorkersPerRegion: envOrDefaultInt("OPENSANDBOX_MIN_WORKERS", 1), MaxWorkersPerRegion: envOrDefaultInt("OPENSANDBOX_MAX_WORKERS", 10), IdleReserveWorkers: envOrDefaultInt("OPENSANDBOX_IDLE_RESERVE", 1), + MinIdleCapacity: envOrDefaultInt("OPENSANDBOX_MIN_IDLE_CAPACITY", 0), + MinIdleCPUs: envOrDefaultInt("OPENSANDBOX_MIN_IDLE_CPUS", 0), StripeSecretKey: os.Getenv("STRIPE_SECRET_KEY"), StripeWebhookSecret: os.Getenv("STRIPE_WEBHOOK_SECRET"), diff --git a/internal/config/secrets.go b/internal/config/secrets.go index 82e73c09..ce09df93 100644 --- a/internal/config/secrets.go +++ b/internal/config/secrets.go @@ -75,6 +75,8 @@ var secretMapping = map[string]string{ "server-sentry-dsn": "OPENSANDBOX_SENTRY_DSN", "server-azure-vm-sizes": "OPENSANDBOX_AZURE_VM_SIZES", "server-ec2-instance-types": "OPENSANDBOX_EC2_INSTANCE_TYPES", + "server-min-idle-capacity": "OPENSANDBOX_MIN_IDLE_CAPACITY", + "server-min-idle-cpus": "OPENSANDBOX_MIN_IDLE_CPUS", "server-ocfs2-node-ips": "OPENSANDBOX_OCFS2_NODE_IPS", "server-s3-access-key": "OPENSANDBOX_S3_ACCESS_KEY_ID", "server-s3-secret-key": "OPENSANDBOX_S3_SECRET_ACCESS_KEY", diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go index c2e8c48f..5ebaef46 100644 --- a/internal/controlplane/scaler.go +++ b/internal/controlplane/scaler.go @@ -24,6 +24,7 @@ const ( scaleDownThreshold = 0.20 // Scale down when utilization < 20% maxWorkersPerRegion = 10 // Hard cap to prevent runaway launches pendingWorkerTTL = 10 * time.Minute // How long to wait for a launched worker to register + defaultWorkerCap = 50 // Capacity estimate for pending workers before first heartbeat. 
// Resource-based scaling thresholds (applied per-worker, trigger on ANY worker exceeding) resourceCPUThreshold = 70.0 // Scale up if any worker CPU > 70% @@ -90,7 +91,16 @@ type ScalerConfig struct { MinWorkers int // minimum total workers per region (0 = default 1). Always kept running. MaxWorkers int // maximum workers per region (0 = default 10). Hard cap to prevent runaway launches. IdleReserve int // target idle (0 sandbox) workers for burst absorption (0 = default 1). Separate from MinWorkers. - WorkerPool string // optional placement pool filter: ondemand or burst + // MinIdleCapacity is the target spare sandbox slot capacity per region. + // When >0, this replaces the worker-count based MinWorkers/IdleReserve + // pre-provisioning logic while still respecting MaxWorkers. + MinIdleCapacity int + // MinIdleCPUs is the target spare sandbox CPU units per region. When >0, + // it takes precedence over MinIdleCapacity and is converted to slots via + // DefaultSandboxCPUs. + MinIdleCPUs int + DefaultSandboxCPUs int + WorkerPool string // optional placement pool filter: ondemand or burst // Event emit for D1 sandboxes_index sync. 
After a scaler-triggered // migration succeeds (rolling replace, evacuation), XADD a "migrated" @@ -168,6 +178,9 @@ type Scaler struct { minWorkers int maxWorkers int idleReserve int + minIdleCap int + minIdleCPUs int + sandboxCPUs int workerPool string rdb *redis.Client @@ -207,6 +220,10 @@ func NewScaler(cfg ScalerConfig) *Scaler { if idleReserve < 0 { idleReserve = 0 } + sandboxCPUs := cfg.DefaultSandboxCPUs + if sandboxCPUs <= 0 { + sandboxCPUs = 1 + } stateStore := cfg.StateStore if stateStore == nil { stateStore = NewInMemoryScalerState() @@ -223,6 +240,9 @@ func NewScaler(cfg ScalerConfig) *Scaler { minWorkers: minWorkers, maxWorkers: maxWorkers, idleReserve: idleReserve, + minIdleCap: cfg.MinIdleCapacity, + minIdleCPUs: cfg.MinIdleCPUs, + sandboxCPUs: sandboxCPUs, workerPool: cfg.WorkerPool, machineSizes: cfg.MachineSizes, rdb: cfg.RedisClient, @@ -516,10 +536,15 @@ func (s *Scaler) evaluateRegion(ctx context.Context, region string) { } } - // Ensure minimum workers are running (pre-provisioned capacity). - // Ignores cooldowns but respects creation failure backoff. totalWorkers := len(workers) + len(s.state.GetPendingLaunches(region)) - if totalWorkers < s.minWorkers { + + if s.hasMinIdleReserve() { + if s.ensureMinIdleReserve(ctx, region, workers, totalWorkers) { + return + } + } else if totalWorkers < s.minWorkers { + // Ensure minimum workers are running (pre-provisioned capacity). + // Ignores cooldowns but respects creation failure backoff. if until, ok := s.state.GetCreationBackoffUntil(region); ok { log.Printf("scaler: region %s below minimum workers (%d/%d) but creation backoff active until %s", region, totalWorkers, s.minWorkers, until.Format(time.RFC3339)) @@ -534,29 +559,31 @@ func (s *Scaler) evaluateRegion(ctx context.Context, region string) { return } - // Headroom: maintain a pool of idle workers for burst absorption. - // Uses minWorkers as the reserve target — this is separate from the - // minimum total workers check above. 
When bin-packing overflows into - // reserve workers, we launch replacements one at a time so there's - // always warm capacity without thrashing. - idleWorkers := 0 - for _, w := range workers { - if s.state.IsDraining(w.MachineID) { - continue + if !s.hasMinIdleReserve() { + // Headroom: maintain a pool of idle workers for burst absorption. + // Uses minWorkers as the reserve target — this is separate from the + // minimum total workers check above. When bin-packing overflows into + // reserve workers, we launch replacements one at a time so there's + // always warm capacity without thrashing. + idleWorkers := 0 + for _, w := range workers { + if s.state.IsDraining(w.MachineID) { + continue + } + if w.Current == 0 { + idleWorkers++ + } } - if w.Current == 0 { - idleWorkers++ + pendingCount := len(s.state.GetPendingLaunches(region)) + reserveTarget := s.idleReserve + idleOrPending := idleWorkers + pendingCount + if idleOrPending < reserveTarget && totalWorkers < s.maxWorkers { + // Launch 1 at a time to avoid over-provisioning + log.Printf("scaler: region %s reserve low (%d idle + %d pending < %d target), launching 1", + region, idleWorkers, pendingCount, reserveTarget) + s.scaleUp(ctx, region) } } - pendingCount := len(s.state.GetPendingLaunches(region)) - reserveTarget := s.idleReserve - idleOrPending := idleWorkers + pendingCount - if idleOrPending < reserveTarget && totalWorkers+pendingCount < s.maxWorkers { - // Launch 1 at a time to avoid over-provisioning - log.Printf("scaler: region %s reserve low (%d idle + %d pending < %d target), launching 1", - region, idleWorkers, pendingCount, reserveTarget) - s.scaleUp(ctx, region) - } if needsScaleUp { // Cascade of guards. 
Important: each guard logs and SKIPS scale-up but @@ -596,7 +623,7 @@ func (s *Scaler) evaluateRegion(ctx context.Context, region string) { region, reason, maxCPU, maxMem, maxDisk, utilization*100) s.scaleUp(ctx, region) } - } else if utilization < scaleDownThreshold && len(workers) > s.minWorkers { + } else if utilization < scaleDownThreshold && s.canScaleDown(region, workers) { // Phase 4: Scale down via smart drain (live-migrate sandboxes, then destroy) log.Printf("scaler: region %s utilization %.1f%% < %.0f%%, initiating smart drain", region, utilization*100, scaleDownThreshold*100) s.smartScaleDown(ctx, region, workers) @@ -606,6 +633,92 @@ func (s *Scaler) evaluateRegion(ctx context.Context, region string) { s.rollingReplace(ctx, region, workers) } +func (s *Scaler) hasMinIdleReserve() bool { + return s.minIdleCPUs > 0 || s.minIdleCap > 0 +} + +func (s *Scaler) minIdleReserveTarget() (target, multiplier int, label string) { + if s.minIdleCPUs > 0 { + return s.minIdleCPUs, s.sandboxCPUs, "cpu" + } + return s.minIdleCap, 1, "slot" +} + +func (s *Scaler) ensureMinIdleReserve(ctx context.Context, region string, workers []*WorkerInfo, totalWorkers int) bool { + target, multiplier, label := s.minIdleReserveTarget() + idleCapacity, pendingCapacity, estimate := s.idleCapacity(region, workers, multiplier) + if idleCapacity+pendingCapacity >= target { + return false + } + if until, ok := s.state.GetCreationBackoffUntil(region); ok { + log.Printf("scaler: region %s idle %s capacity low (%d live + %d pending < %d target) but creation backoff active until %s", + region, label, idleCapacity, pendingCapacity, target, until.Format(time.RFC3339)) + return true + } + remainingWorkers := s.maxWorkers - totalWorkers + if remainingWorkers <= 0 { + log.Printf("scaler: region %s idle %s capacity low (%d live + %d pending < %d target) but at max workers (%d/%d)", + region, label, idleCapacity, pendingCapacity, target, totalWorkers, s.maxWorkers) + return false + } + + deficit := 
target - idleCapacity - pendingCapacity + launches := (deficit + estimate - 1) / estimate + if launches > remainingWorkers { + launches = remainingWorkers + } + log.Printf("scaler: region %s idle %s capacity low (%d live + %d pending < %d target), launching %d worker(s) using ~%d %s/worker", + region, label, idleCapacity, pendingCapacity, target, launches, estimate, label) + for i := 0; i < launches; i++ { + s.scaleUp(ctx, region) + } + return true +} + +func (s *Scaler) idleCapacity(region string, workers []*WorkerInfo, multiplier int) (idleCapacity, pendingCapacity, estimate int) { + if multiplier <= 0 { + multiplier = 1 + } + estimate = defaultWorkerCap * multiplier + for _, w := range workers { + if w.Draining || s.state.IsDraining(w.MachineID) { + continue + } + workerCapacity := w.Capacity * multiplier + if workerCapacity > estimate { + estimate = workerCapacity + } + if w.Capacity > w.Current { + idleCapacity += (w.Capacity - w.Current) * multiplier + } + } + pendingCapacity = len(s.state.GetPendingLaunches(region)) * estimate + return idleCapacity, pendingCapacity, estimate +} + +func (s *Scaler) canScaleDown(region string, workers []*WorkerInfo) bool { + if len(workers) <= s.minWorkers { + return false + } + if !s.hasMinIdleReserve() { + return true + } + + target, multiplier, _ := s.minIdleReserveTarget() + idleCapacity, pendingCapacity, _ := s.idleCapacity(region, workers, multiplier) + bestRemovable := 0 + for _, w := range workers { + if w.Draining || s.state.IsDraining(w.MachineID) { + continue + } + spare := (w.Capacity - w.Current) * multiplier + if spare > bestRemovable { + bestRemovable = spare + } + } + return idleCapacity+pendingCapacity-bestRemovable >= target +} + func (s *Scaler) scaleUp(_ context.Context, region string) { // Check creation failure backoff if until, ok := s.state.GetCreationBackoffUntil(region); ok { diff --git a/internal/controlplane/scaler_test.go b/internal/controlplane/scaler_test.go index a192a613..ab4f5852 100644 --- 
a/internal/controlplane/scaler_test.go +++ b/internal/controlplane/scaler_test.go @@ -402,6 +402,211 @@ func TestMinWorkersEnforced(t *testing.T) { } } +func TestMinIdleCapacityOverridesMinWorkers(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 5, + MaxWorkers: 10, + MinIdleCapacity: 120, + }) + + s.evaluateRegion(context.Background(), "us-east-1") + time.Sleep(50 * time.Millisecond) + + if got := atomic.LoadInt32(&pool.created); got != 2 { + t.Fatalf("expected 2 launches to add ~100 spare slots, got %d", got) + } +} + +func TestMinIdleCapacityCountsPendingLaunches(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 1, + MaxWorkers: 10, + MinIdleCapacity: 100, + }) + s.state.AddPendingLaunch("us-east-1", pendingLaunch{ + MachineID: "osb-worker-pending-test", + LaunchedAt: time.Now(), + }) + + s.evaluateRegion(context.Background(), "us-east-1") + time.Sleep(50 * time.Millisecond) + + if got := atomic.LoadInt32(&pool.created); got != 0 { + t.Fatalf("expected no launches because pending capacity satisfies reserve, got %d", got) + } +} + +func TestMinIdleCapacityRespectsMaxWorkers(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + for i := 0; i < 2; i++ { + reg.addWorker(&WorkerInfo{ + ID: fmt.Sprintf("w%d", i), MachineID: fmt.Sprintf("osb-worker-w%d", i), Region: "us-east-1", + Capacity: 50, Current: 45, CPUPct: 0, MemPct: 
0, DiskPct: 0, + }) + } + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 1, + MaxWorkers: 2, + MinIdleCapacity: 100, + }) + + s.evaluateRegion(context.Background(), "us-east-1") + time.Sleep(50 * time.Millisecond) + + if got := atomic.LoadInt32(&pool.created); got != 0 { + t.Fatalf("expected no launches at max workers, got %d", got) + } +} + +func TestMinIdleCapacityPreventsScaleDownBelowReserve(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + reg.addWorker(&WorkerInfo{ + ID: "w2", MachineID: "osb-worker-w2", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 1, + MaxWorkers: 10, + MinIdleCapacity: 80, + }) + + if s.canScaleDown("us-east-1", reg.GetWorkersByRegion("us-east-1")) { + t.Fatal("expected min idle capacity reserve to block scale-down") + } +} + +func TestMinIdleCPUsOverridesMinIdleCapacity(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 1, + MaxWorkers: 10, + MinIdleCapacity: 200, + MinIdleCPUs: 120, + DefaultSandboxCPUs: 2, + }) + + s.evaluateRegion(context.Background(), "us-east-1") + time.Sleep(50 * time.Millisecond) + + if got := atomic.LoadInt32(&pool.created); got != 1 { + t.Fatalf("expected 1 launch using cpu reserve target, got %d", got) + } +} + +func 
TestMinIdleCPUsCountsPendingLaunches(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 1, + MaxWorkers: 10, + MinIdleCPUs: 200, + DefaultSandboxCPUs: 2, + }) + s.state.AddPendingLaunch("us-east-1", pendingLaunch{ + MachineID: "osb-worker-pending-test", + LaunchedAt: time.Now(), + }) + + s.evaluateRegion(context.Background(), "us-east-1") + time.Sleep(50 * time.Millisecond) + + if got := atomic.LoadInt32(&pool.created); got != 0 { + t.Fatalf("expected no launches because pending CPU reserve satisfies target, got %d", got) + } +} + +func TestMinIdleCPUsPreventsScaleDownBelowReserve(t *testing.T) { + reg := newMockRegistry() + pool := newMockPool() + + reg.addWorker(&WorkerInfo{ + ID: "w1", MachineID: "osb-worker-w1", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + reg.addWorker(&WorkerInfo{ + ID: "w2", MachineID: "osb-worker-w2", Region: "us-east-1", + Capacity: 50, Current: 0, CPUPct: 0, MemPct: 0, DiskPct: 0, + }) + + s := NewScaler(ScalerConfig{ + Pool: pool, + Registry: reg, + Cooldown: 1 * time.Second, + Interval: 100 * time.Millisecond, + MinWorkers: 1, + MaxWorkers: 10, + MinIdleCPUs: 160, + DefaultSandboxCPUs: 2, + }) + + if s.canScaleDown("us-east-1", reg.GetWorkersByRegion("us-east-1")) { + t.Fatal("expected min idle CPU reserve to block scale-down") + } +} + // ============================================================ // Test: Scale-down triggers // ============================================================ diff --git a/internal/qemu/manager.go b/internal/qemu/manager.go index 8033a8c9..f75bbe51 100644 --- a/internal/qemu/manager.go +++ b/internal/qemu/manager.go @@ -1380,15 +1380,16 @@ func (m 
*Manager) createFromGolden(ctx context.Context, cfg types.SandboxConfig, mountCtx, mountCancel := context.WithTimeout(context.Background(), 10*time.Second) _, mountErr := agentClient.Exec(mountCtx, &pb.ExecRequest{ Command: "/bin/sh", - Args: []string{"-c", strings.Join([]string{ - "echo 3 > /proc/sys/vm/drop_caches", - "echo 3 > /proc/sys/vm/drop_caches", - "mount /dev/vdb /home/sandbox 2>/dev/null || true", - "resize2fs /dev/vdb 2>/dev/null || true", - "chown 1000:1000 /home/sandbox", - "mkdir -p /home/sandbox/.osb-apt-cache /var/cache/apt/archives", - "mountpoint -q /var/cache/apt/archives || mount --bind /home/sandbox/.osb-apt-cache /var/cache/apt/archives 2>/dev/null || true", - }, " && ")}, + Args: []string{"-c", ` +set -u +echo 3 > /proc/sys/vm/drop_caches +echo 3 > /proc/sys/vm/drop_caches +mount /dev/vdb /home/sandbox 2>/dev/null || true +(timeout 30s resize2fs /dev/vdb >/dev/null 2>&1 || true) & +chown 1000:1000 /home/sandbox +mkdir -p /home/sandbox/.osb-apt-cache /var/cache/apt/archives +mountpoint -q /var/cache/apt/archives || mount --bind /home/sandbox/.osb-apt-cache /var/cache/apt/archives 2>/dev/null || true +`}, RunAsRoot: true, }) mountCancel() diff --git a/internal/worker/grpc_server.go b/internal/worker/grpc_server.go index d6e4840a..e5e92a69 100644 --- a/internal/worker/grpc_server.go +++ b/internal/worker/grpc_server.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "time" @@ -88,6 +89,8 @@ type GRPCServer struct { // region is the worker's region label, used to tag operation metrics. // Set via SetRegion at startup. Empty = "unknown". 
region string + + createSem chan struct{} } // SetRegion stamps the worker's region onto operation metrics emitted from @@ -140,11 +143,42 @@ func NewGRPCServer(mgr sandbox.Manager, ptyMgr *sandbox.PTYManager, execMgr *san checkpointStore: checkpointStore, store: store, server: grpc.NewServer(serverOpts...), + createSem: make(chan struct{}, workerCreateConcurrency()), } pb.RegisterSandboxWorkerServer(s.server, s) return s } +func workerCreateConcurrency() int { + const defaultCreateConcurrency = 4 + raw := strings.TrimSpace(os.Getenv("OPENSANDBOX_WORKER_CREATE_CONCURRENCY")) + if raw == "" { + return defaultCreateConcurrency + } + n, err := strconv.Atoi(raw) + if err != nil || n <= 0 { + log.Printf("grpc: invalid OPENSANDBOX_WORKER_CREATE_CONCURRENCY=%q, using %d", raw, defaultCreateConcurrency) + return defaultCreateConcurrency + } + return n +} + +func (s *GRPCServer) acquireCreateSlot(ctx context.Context, sandboxID string) (func(), error) { + if s.createSem == nil { + return func() {}, nil + } + start := time.Now() + select { + case s.createSem <- struct{}{}: + if waited := time.Since(start); waited > 250*time.Millisecond { + log.Printf("grpc: CreateSandbox %s waited %s for worker create slot", sandboxID, waited.Round(time.Millisecond)) + } + return func() { <-s.createSem }, nil + case <-ctx.Done(): + return nil, ctx.Err() + } +} + // Start starts the gRPC server on the given address. 
func (s *GRPCServer) Start(addr string) error { lis, err := net.Listen("tcp", addr) @@ -204,6 +238,12 @@ func parseSecretAllowedHosts(m map[string]string) map[string][]string { } func (s *GRPCServer) CreateSandbox(ctx context.Context, req *pb.CreateSandboxRequest) (*pb.CreateSandboxResponse, error) { + releaseCreateSlot, err := s.acquireCreateSlot(ctx, req.SandboxId) + if err != nil { + return nil, status.Errorf(codes.Unavailable, "worker create queue cancelled: %v", err) + } + defer releaseCreateSlot() + cfg := types.SandboxConfig{ Template: req.Template, Timeout: int(req.Timeout), diff --git a/scripts/integration-tests/05-burst-edge-routing-smoke.ts b/scripts/integration-tests/05-burst-edge-routing-smoke.ts new file mode 100644 index 00000000..6bd25b76 --- /dev/null +++ b/scripts/integration-tests/05-burst-edge-routing-smoke.ts @@ -0,0 +1,196 @@ +/** + * Smoke test for Burst sandbox edge routing. + * + * Creates a Burst sandbox through the main public API, verifies basic exec, + * then proves the sandbox landed in the expected cell by either: + * - listing sandboxes through the direct cell/LB API, or + * - reading sandbox_sessions from Postgres. + * + * Usage: + * OPENCOMPUTER_API_KEY=osb_... \ + * OPENCOMPUTER_VERIFY_MODE=postgres \ + * OPENCOMPUTER_DATABASE_URL=postgres://... \ + * npx tsx scripts/integration-tests/05-burst-edge-routing-smoke.ts + * + * Or verify via the direct cell API: + * OPENCOMPUTER_API_KEY=osb_... \ + * OPENCOMPUTER_VERIFY_MODE=direct-list \ + * OPENCOMPUTER_DIRECT_API_URL=https://oc-alb-aws-us-east-2-burst-prod-972530125.us-east-2.elb.amazonaws.com \ + * OPENCOMPUTER_DIRECT_API_KEY=osb_... 
\
 *   OPENCOMPUTER_DIRECT_TLS_INSECURE=1 \
 *   npx tsx scripts/integration-tests/05-burst-edge-routing-smoke.ts
 */

import { Sandbox } from "../../sdks/typescript/src";
import { execFileSync } from "node:child_process";

// Endpoint and credential configuration, all overridable through the environment.
const MAIN_API_URL = process.env.OPENCOMPUTER_API_URL || "https://app.opencomputer.dev";
const MAIN_API_KEY = process.env.OPENCOMPUTER_API_KEY || "";
const VERIFY_MODE = process.env.OPENCOMPUTER_VERIFY_MODE || "direct-list";
const DIRECT_API_URL =
  process.env.OPENCOMPUTER_DIRECT_API_URL ||
  "https://oc-alb-aws-us-east-2-burst-prod-972530125.us-east-2.elb.amazonaws.com";
// The direct cell API falls back to the main API key when no dedicated key is given.
const DIRECT_API_KEY = process.env.OPENCOMPUTER_DIRECT_API_KEY || MAIN_API_KEY;
const DATABASE_URL = process.env.OPENCOMPUTER_DATABASE_URL || "";

// NOTE(review): NODE_TLS_REJECT_UNAUTHORIZED is process-wide, so this opt-in
// also disables certificate verification for requests to the main API, not
// only for the direct cell endpoint. Use it for ad-hoc smoke runs only.
if (process.env.OPENCOMPUTER_DIRECT_TLS_INSECURE === "1") {
  process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";
}

/** Strips trailing slashes from `url` and appends `/api` unless it already ends with it. */
function apiBase(url: string): string {
  const base = url.replace(/\/+$/, "");
  return base.endsWith("/api") ? base : `${base}/api`;
}

/** Throws when the configuration value `value` (named `name` in the message) is empty. */
function requireEnv(name: string, value: string): void {
  if (!value) {
    throw new Error(`${name} is required`);
  }
}

/** Shape of the JSON object built by the sandbox_sessions query below. */
interface SessionRow {
  sandboxID: string;
  status: string;
  workerID: string;
  region: string;
  template: string;
  config: Record<string, unknown>;
}

/**
 * Fetches the sandbox list from the direct cell API.
 *
 * Throws on a non-2xx response (including the response body in the message)
 * and when the payload is not a JSON array.
 */
async function listDirectSandboxes(): Promise<Array<Record<string, unknown>>> {
  const resp = await fetch(`${apiBase(DIRECT_API_URL)}/sandboxes`, {
    headers: DIRECT_API_KEY ? { "X-API-Key": DIRECT_API_KEY } : {},
  });
  if (!resp.ok) {
    const text = await resp.text();
    throw new Error(`direct sandbox list failed: ${resp.status} ${text}`);
  }
  const data = await resp.json();
  if (!Array.isArray(data)) {
    throw new Error(`direct sandbox list returned non-array response: ${JSON.stringify(data)}`);
  }
  return data;
}

/**
 * Polls the direct cell list once per second until an entry whose `sandboxID`
 * or `id` equals `sandboxID` appears, and returns that entry.
 *
 * Throws if the sandbox has not appeared within `timeoutMs`; any error from a
 * list request propagates immediately.
 */
async function waitForDirectList(sandboxID: string, timeoutMs = 30_000): Promise<Record<string, unknown>> {
  const deadline = Date.now() + timeoutMs;
  let lastCount = 0;

  while (Date.now() < deadline) {
    const sandboxes = await listDirectSandboxes();
    lastCount = sandboxes.length;
    const found = sandboxes.find((sb) => sb.sandboxID === sandboxID || sb.id === sandboxID);
    if (found) {
      return found;
    }
    await new Promise((resolve) => setTimeout(resolve, 1_000));
  }

  throw new Error(`sandbox ${sandboxID} was not visible from direct cell list after ${timeoutMs}ms (last count=${lastCount})`);
}

/**
 * Looks up the most recently started sandbox_sessions row for `sandboxID`
 * by shelling out to `psql` against DATABASE_URL.
 *
 * Throws when the id contains characters outside [a-zA-Z0-9_-], when psql
 * fails, or when no row is returned.
 */
function queryPostgresSession(sandboxID: string): SessionRow {
  // The id is interpolated into the SQL text below, so only a strict
  // identifier alphabet is accepted; this check is what keeps the query safe.
  if (!/^[a-zA-Z0-9_-]+$/.test(sandboxID)) {
    throw new Error(`refusing unsafe sandbox id for SQL lookup: ${sandboxID}`);
  }

  const query = `
    SELECT json_build_object(
      'sandboxID', sandbox_id,
      'status', status,
      'workerID', worker_id,
      'region', region,
      'template', template,
      'config', config
    )::text
    FROM sandbox_sessions
    WHERE sandbox_id = '${sandboxID}'
    ORDER BY started_at DESC
    LIMIT 1;
  `;

  // -X skips psqlrc, -A/-t produce unaligned tuples-only output, i.e. the bare JSON text.
  const out = execFileSync("psql", [DATABASE_URL, "-XAt", "-c", query], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
  }).trim();

  if (!out) {
    throw new Error(`sandbox ${sandboxID} was not found in sandbox_sessions`);
  }
  return JSON.parse(out) as SessionRow;
}

/**
 * Runs the smoke test: validates configuration for the selected verify mode,
 * creates a Burst sandbox through the main API, checks a trivial exec, then
 * confirms placement via the direct cell list or Postgres. The sandbox is
 * killed in all cases once it has been created.
 */
async function main() {
  requireEnv("OPENCOMPUTER_API_KEY", MAIN_API_KEY);

  console.log(`main API: ${MAIN_API_URL}`);
  console.log(`verify mode: ${VERIFY_MODE}`);

  // Validate mode-specific configuration before creating anything billable.
  if (VERIFY_MODE === "direct-list") {
    requireEnv("OPENCOMPUTER_DIRECT_API_KEY", DIRECT_API_KEY);
    console.log(`direct API: ${DIRECT_API_URL}`);
    const before = await listDirectSandboxes();
    console.log(`direct list before create: ${before.length} running sandbox(es)`);
  } else if (VERIFY_MODE === "postgres") {
    requireEnv("OPENCOMPUTER_DATABASE_URL", DATABASE_URL);
  } else {
    throw new Error(`unsupported OPENCOMPUTER_VERIFY_MODE=${VERIFY_MODE}`);
  }

  const sandbox = await Sandbox.create({
    apiUrl: MAIN_API_URL,
    apiKey: MAIN_API_KEY,
    template: "base",
    burst: true,
    timeout: 0,
  });

  console.log(`created burst sandbox: ${sandbox.sandboxId} (${sandbox.status})`);

  try {
    const result = await sandbox.exec.run("echo hello world", { timeout: 30 });
    const stdout = result.stdout.trim();
    if (result.exitCode !== 0 || stdout !== "hello world") {
      throw new Error(`exec failed: exit=${result.exitCode}, stdout=${JSON.stringify(stdout)}, stderr=${JSON.stringify(result.stderr)}`);
    }
    console.log("exec check: ok");

    if (VERIFY_MODE === "direct-list") {
      const direct = await waitForDirectList(sandbox.sandboxId);
      console.log(`direct list check: ok (${sandbox.sandboxId})`);
      // The list payload's key casing is not fixed here, so accept the common variants.
      const worker = direct.workerID || direct.workerId || direct.worker_id;
      if (worker) {
        console.log(`worker: ${String(worker)}`);
      }
      if (direct.region) {
        console.log(`region: ${String(direct.region)}`);
      }
    } else {
      const row = queryPostgresSession(sandbox.sandboxId);
      if (row.status !== "running") {
        throw new Error(`expected running session, got ${row.status}`);
      }
      if (!row.workerID) {
        throw new Error("session has no workerID");
      }
      // Any one of these markers is accepted as evidence of a Burst session.
      if (row.config?.burst !== true && row.config?.resumable !== true && row.config?.sandboxFamily !== "spot") {
        throw new Error(`session config does not look like Burst: ${JSON.stringify(row.config)}`);
      }
      console.log(`postgres check: ok (${row.sandboxID})`);
      console.log(`worker: ${row.workerID}`);
      console.log(`region: ${row.region}`);
      console.log(`template: ${row.template}`);
    }
  } finally {
    // Best-effort cleanup: a failed kill is reported but does not mask the test result.
    try {
      await sandbox.kill();
      console.log(`cleaned up: ${sandbox.sandboxId}`);
    } catch (err) {
      console.error(`cleanup failed for ${sandbox.sandboxId}:`, err);
    }
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});