diff --git a/acceptance/bundle/validate/immutable_workspace_paths/databricks.yml b/acceptance/bundle/validate/immutable_workspace_paths/databricks.yml new file mode 100644 index 0000000000..39c25fb365 --- /dev/null +++ b/acceptance/bundle/validate/immutable_workspace_paths/databricks.yml @@ -0,0 +1,21 @@ +bundle: + name: my-bundle + immutable: true + +sync: + exclude: + # Test framework files that are not part of the bundle source. + - "repls.json" + - "user_repls.json" + - "script" + - "*.toml" + +resources: + jobs: + my_job: + name: my job + tasks: + - task_key: my_task + existing_cluster_id: "0101-120000-aaaaaaaa" + spark_python_task: + python_file: ./src/main.py diff --git a/acceptance/bundle/validate/immutable_workspace_paths/out.test.toml b/acceptance/bundle/validate/immutable_workspace_paths/out.test.toml new file mode 100644 index 0000000000..f784a18325 --- /dev/null +++ b/acceptance/bundle/validate/immutable_workspace_paths/out.test.toml @@ -0,0 +1,3 @@ +Local = true +Cloud = false +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/validate/immutable_workspace_paths/output.txt b/acceptance/bundle/validate/immutable_workspace_paths/output.txt new file mode 100644 index 0000000000..f6a8004bf2 --- /dev/null +++ b/acceptance/bundle/validate/immutable_workspace_paths/output.txt @@ -0,0 +1,30 @@ + +>>> [CLI] bundle validate -o json +Warning: Pattern user_repls.json does not match any files + at sync.exclude[1] + in databricks.yml:9:7 + +{ + "workspace": { + "artifact_path": "/Workspace/Users/[USERNAME]/.bundle/my-bundle/default/artifacts", + "current_user": { + "domain_friendly_name": "[USERNAME]", + "id": "[USERID]", + "short_name": "[USERNAME]", + "userName": "[USERNAME]" + }, + "file_path": "/Workspace/Users/[USERNAME]/.bundle/my-bundle/default/files", + "resource_path": "/Workspace/Users/[USERNAME]/.bundle/my-bundle/default/resources", + "root_path": "/Workspace/Users/[USERNAME]/.bundle/my-bundle/default", + "state_path": "/Workspace/Users/[USERNAME]/.bundle/my-bundle/default/state" + }, + "tasks": [ + { + "existing_cluster_id": "0101-120000-aaaaaaaa", + "spark_python_task": { + "python_file": "${workspace.snapshot_path}/src/files/src/main.py" + }, + "task_key": "my_task" + } + ] +} diff --git a/acceptance/bundle/validate/immutable_workspace_paths/script b/acceptance/bundle/validate/immutable_workspace_paths/script new file mode 100644 index 0000000000..df056fa9b9 --- /dev/null +++ b/acceptance/bundle/validate/immutable_workspace_paths/script @@ -0,0 +1 @@ +trace $CLI bundle validate -o json | jq '{workspace: .workspace, tasks: .resources.jobs.my_job.tasks}' diff --git a/acceptance/bundle/validate/immutable_workspace_paths/src/main.py b/acceptance/bundle/validate/immutable_workspace_paths/src/main.py new file mode 100644 index 0000000000..11b15b1a45 --- /dev/null +++ b/acceptance/bundle/validate/immutable_workspace_paths/src/main.py @@ -0,0 +1 @@ +print("hello") diff --git a/acceptance/bundle/validate/immutable_workspace_paths/test.toml b/acceptance/bundle/validate/immutable_workspace_paths/test.toml new file mode 100644 index 0000000000..85e02532c9 --- /dev/null +++ b/acceptance/bundle/validate/immutable_workspace_paths/test.toml @@ -0,0 +1,3 @@ +Local = true +Cloud = false +Ignore = [".databricks"] diff --git a/bundle/config/bundle.go b/bundle/config/bundle.go index ce6d25bfe6..79f92b6b43 100644 --- a/bundle/config/bundle.go +++ b/bundle/config/bundle.go @@ -59,4 +59,11 @@ type Bundle struct { // A stable generated UUID for the bundle. This is normally serialized by // Databricks first party template when a user runs bundle init. Uuid string `json:"uuid,omitempty"` + + // Immutable specifies that bundle files and artifacts are uploaded as a single + // immutable snapshot rather than being synced individually. When true, the + // deployment calls /api/2.0/repos/snapshots with a zip containing all files + // and sets workspace.file_path and workspace.artifact_path to the returned + // content-addressed path. validate and plan make no mutative API calls. + Immutable bool `json:"immutable,omitempty"` } diff --git a/bundle/config/mutator/resolve_variable_references.go b/bundle/config/mutator/resolve_variable_references.go index 113f057639..70a330c8a3 100644 --- a/bundle/config/mutator/resolve_variable_references.go +++ b/bundle/config/mutator/resolve_variable_references.go @@ -59,6 +59,11 @@ type resolveVariableReferences struct { includeResources bool artifactsReferenceUsed bool + + // excludePaths lists variable reference paths (e.g. "workspace.file_path") whose + // resolution should be skipped. References to these paths remain unresolved so a + // later mutator can set the value and re-run resolution. + excludePaths []string } func ResolveVariableReferencesOnlyResources(prefixes ...string) bundle.Mutator { @@ -74,6 +79,22 @@ func ResolveVariableReferencesOnlyResources(prefixes ...string) bundle.Mutator { } } +// ResolveVariableReferencesOnlyResourcesExcluding resolves variable references in +// resources while leaving references to the specified paths unresolved. +// Used by ProcessStaticResources for immutable bundles so that ${workspace.snapshot_path} +// is not resolved during Initialize; it is resolved in the Deploy phase after +// snapshot.Upload() sets workspace.snapshot_path to the API-assigned path. +func ResolveVariableReferencesOnlyResourcesExcluding(excludePaths ...string) bundle.Mutator { + return &resolveVariableReferences{ + prefixes: defaultPrefixes, + lookupFn: lookup, + extraRounds: maxResolutionRounds - 1, + pattern: dyn.NewPattern(dyn.Key("resources")), + includeResources: true, + excludePaths: excludePaths, + } +} + func ResolveVariableReferencesWithoutResources(prefixes ...string) bundle.Mutator { if len(prefixes) == 0 { prefixes = defaultPrefixes @@ -229,6 +250,9 @@ func (m *resolveVariableReferences) resolveOnce(b *bundle.Bundle, prefixes []dyn // Perform resolution only if the path starts with one of the specified prefixes. if slices.ContainsFunc(prefixes, path.HasPrefix) { + if slices.Contains(m.excludePaths, path.String()) { + return dyn.InvalidValue, dynvar.ErrSkipResolution + } value, err := m.lookupFn(normalized, path, b) hasUpdates = hasUpdates || (err == nil && value.IsValid()) return value, err diff --git a/bundle/config/mutator/resolve_variable_references_test.go b/bundle/config/mutator/resolve_variable_references_test.go index 876980e948..f682419f32 100644 --- a/bundle/config/mutator/resolve_variable_references_test.go +++ b/bundle/config/mutator/resolve_variable_references_test.go @@ -6,7 +6,9 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/databricks-sdk-go/service/jobs" "github.com/databricks/databricks-sdk-go/service/pipelines" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -63,3 +65,46 @@ func TestResolveVariableReferencesWithSourceLinkedDeployment(t *testing.T) { testCase.assert(t, b) } } + +// TestResolveVariableReferencesExcludePaths verifies that paths listed in excludePaths +// are skipped during resolution and left as unresolved variable references. +// This is used by ProcessStaticResources for immutable bundles so that +// ${workspace.file_path} and ${workspace.artifact_path} can be resolved later +// (in the Build phase, after artifacts are built and the correct snapshot path is known). +func TestResolveVariableReferencesExcludePaths(t *testing.T) { + b := &bundle.Bundle{ + Config: config.Root{ + Workspace: config.Workspace{ + FilePath: "/snapshot/path/src/files", + ArtifactPath: "/snapshot/path/src/artifacts", + }, + Resources: config.Resources{ + Jobs: map[string]*resources.Job{ + "job1": { + JobSettings: jobs.JobSettings{ + Tasks: []jobs.Task{ + { + SparkPythonTask: &jobs.SparkPythonTask{ + PythonFile: "${workspace.file_path}/main.py", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // With exclusion: ${workspace.file_path} should remain unresolved. + diags := bundle.Apply(t.Context(), b, ResolveVariableReferencesOnlyResourcesExcluding("workspace.file_path", "workspace.artifact_path")) + require.NoError(t, diags.Error()) + assert.Equal(t, "${workspace.file_path}/main.py", b.Config.Resources.Jobs["job1"].Tasks[0].SparkPythonTask.PythonFile, + "reference should remain unresolved when path is excluded") + + // Without exclusion: ${workspace.file_path} should resolve normally. + diags = bundle.Apply(t.Context(), b, ResolveVariableReferencesOnlyResources()) + require.NoError(t, diags.Error()) + assert.Equal(t, "/snapshot/path/src/files/main.py", b.Config.Resources.Jobs["job1"].Tasks[0].SparkPythonTask.PythonFile, + "reference should resolve after exclusion is lifted") +} diff --git a/bundle/config/mutator/resourcemutator/process_static_resources.go b/bundle/config/mutator/resourcemutator/process_static_resources.go index 3f859fd11c..94c823c75b 100644 --- a/bundle/config/mutator/resourcemutator/process_static_resources.go +++ b/bundle/config/mutator/resourcemutator/process_static_resources.go @@ -38,13 +38,26 @@ func (p processStaticResources) Apply(ctx context.Context, b *bundle.Bundle) dia // we need to resolve variables because they can change path values: // - variable can be used a prefix // - path can be part of a complex variable value + + // For immutable bundles, defer resolving ${workspace.snapshot_path} in resources. + // The actual snapshot path is only known after snapshot.Upload() returns the + // API-assigned path in the deploy phase. + var resourceResolver bundle.Mutator + if b.Config.Bundle.Immutable { + resourceResolver = mutator.ResolveVariableReferencesOnlyResourcesExcluding( + "workspace.snapshot_path", + ) + } else { + resourceResolver = mutator.ResolveVariableReferencesOnlyResources() + } + bundle.ApplySeqContext( ctx, b, // Reads (dynamic): * (strings) (searches for variable references in string values) // Updates (dynamic): resources.* (strings) (resolves variable references to their actual values) // Resolves variable references in 'resources' using bundle, workspace, and variables prefixes - mutator.ResolveVariableReferencesOnlyResources(), + resourceResolver, // After normal variable resolution, log all ${resources.*} references mutator.LogResourceReferences(), mutator.NormalizePaths(), diff --git a/bundle/config/mutator/resourcemutator/resource_mutator.go b/bundle/config/mutator/resourcemutator/resource_mutator.go index 2eb292cfbb..31afb65ffa 100644 --- a/bundle/config/mutator/resourcemutator/resource_mutator.go +++ b/bundle/config/mutator/resourcemutator/resource_mutator.go @@ -127,6 +127,19 @@ func applyInitializeMutators(ctx context.Context, b *bundle.Bundle) { ) } +// resourceVarResolver returns a mutator that resolves variable references in +// resources. For immutable bundles, ${workspace.file_path} and +// ${workspace.artifact_path} are excluded: the API assigns the snapshot path +// after upload, so they must remain as-is until snapshot.Upload() has run. +func resourceVarResolver(b *bundle.Bundle) bundle.Mutator { + if b.Config.Bundle.Immutable { + return mutator.ResolveVariableReferencesOnlyResourcesExcluding( + "workspace.file_path", "workspace.artifact_path", + ) + } + return mutator.ResolveVariableReferencesOnlyResources() +} + // Normalization is applied multiple times if resource is modified during initialization // // If bundle is modified outside of 'resources' section, these changes are discarded. @@ -139,8 +152,10 @@ func applyNormalizeMutators(ctx context.Context, b *bundle.Bundle) { // Reads (dynamic): * (strings) (searches for variable references in string values) // Updates (dynamic): resources.* (strings) (resolves variable references to their actual values) - // Resolves variable references in 'resources' using bundle, workspace, and variables prefixes - mutator.ResolveVariableReferencesOnlyResources(), + // Resolves variable references in 'resources' using bundle, workspace, and variables prefixes. + // For immutable bundles, ${workspace.file_path} and ${workspace.artifact_path} are left + // unresolved: the actual snapshot path is assigned by the API after upload, not pre-computed. + resourceVarResolver(b), // Reads (dynamic): resources.pipelines.*.libraries (checks for notebook.path and file.path fields) // Updates (dynamic): resources.pipelines.*.libraries (expands glob patterns in path fields to multiple library entries) diff --git a/bundle/config/mutator/translate_paths.go b/bundle/config/mutator/translate_paths.go index 99dd75dd78..1d38cee236 100644 --- a/bundle/config/mutator/translate_paths.go +++ b/bundle/config/mutator/translate_paths.go @@ -320,12 +320,21 @@ func (t *translateContext) rewriteValue(ctx context.Context, p dyn.Path, v dyn.V return dyn.NewValue(out, v.Locations()), nil } +// snapshotFilesRoot is the remote root used for file/notebook path translation +// in immutable bundles. References to this placeholder are resolved after +// snapshot.Upload() sets workspace.snapshot_path to the API-assigned path. +const snapshotFilesRoot = "${workspace.snapshot_path}/src/files" + func applyTranslations(ctx context.Context, b *bundle.Bundle, t *translateContext, translations []func(context.Context, dyn.Value) (dyn.Value, error)) diag.Diagnostics { - // Set the remote root to the sync root if source-linked deployment is enabled. - // Otherwise, set it to the workspace file path. - if config.IsExplicitlyEnabled(t.b.Config.Presets.SourceLinkedDeployment) { + switch { + case b.Config.Bundle.Immutable: + // Use a placeholder root that is resolved after snapshot.Upload() sets + // workspace.snapshot_path. This defers path computation until the actual + // content-addressed path is known. + t.remoteRoot = snapshotFilesRoot + case config.IsExplicitlyEnabled(t.b.Config.Presets.SourceLinkedDeployment): t.remoteRoot = t.b.SyncRootPath - } else { + default: t.remoteRoot = t.b.Config.Workspace.FilePath } diff --git a/bundle/config/workspace.go b/bundle/config/workspace.go index 9cd397f13a..284bd0afe0 100644 --- a/bundle/config/workspace.go +++ b/bundle/config/workspace.go @@ -80,6 +80,12 @@ type Workspace struct { // Remote workspace path for deployment state. // This defaults to "${workspace.root}/state". StatePath string `json:"state_path,omitempty"` + + // SnapshotPath is the workspace path of the immutable snapshot uploaded during + // deployment. It is set by snapshot.Upload() and used to resolve + // ${workspace.snapshot_path} references in resource configurations. + // Only populated for bundles with bundle.immutable = true. + SnapshotPath string `json:"snapshot_path,omitempty" bundle:"internal"` } type User struct { diff --git a/bundle/deploy/metadata/compute.go b/bundle/deploy/metadata/compute.go index cb7be9811c..08a45d7a17 100644 --- a/bundle/deploy/metadata/compute.go +++ b/bundle/deploy/metadata/compute.go @@ -105,6 +105,7 @@ func (m *compute) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics // Set file upload destination of the bundle in metadata b.Metadata.Config.Workspace.FilePath = b.Config.Workspace.FilePath + b.Metadata.Config.Workspace.SnapshotPath = b.Config.Workspace.SnapshotPath // In source-linked deployment files are not copied and resources use source files, therefore we use sync path as file path in metadata if config.IsExplicitlyEnabled(b.Config.Presets.SourceLinkedDeployment) { b.Metadata.Config.Workspace.FilePath = b.SyncRootPath diff --git a/bundle/deploy/metadata/load.go b/bundle/deploy/metadata/load.go new file mode 100644 index 0000000000..3fe4eb1c3b --- /dev/null +++ b/bundle/deploy/metadata/load.go @@ -0,0 +1,57 @@ +package metadata + +import ( + "context" + "encoding/json" + "io" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/metadata" + "github.com/databricks/cli/libs/diag" + "github.com/databricks/cli/libs/filer" +) + +type load struct{} + +// Load reads the metadata file written during the last deploy and populates +// fields on the bundle that are not available locally (e.g. workspace.snapshot_path +// for immutable bundles, which is only known after snapshot.Upload() ran). +func Load() bundle.Mutator { + return &load{} +} + +func (m *load) Name() string { + return "metadata.Load" +} + +func (m *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + f, err := filer.NewWorkspaceFilesClient(b.WorkspaceClient(ctx), b.Config.Workspace.StatePath) + if err != nil { + return diag.FromErr(err) + } + + r, err := f.Read(ctx, metadataFileName) + if err != nil { + // Missing metadata file means the bundle was never deployed or was + // deployed by an older CLI version that didn't write metadata. Treat + // it as a no-op so destroy can still proceed. + return nil + } + defer r.Close() + + raw, err := io.ReadAll(r) + if err != nil { + return diag.FromErr(err) + } + + var md metadata.Metadata + if err := json.Unmarshal(raw, &md); err != nil { + return diag.FromErr(err) + } + + if md.Config.Workspace.SnapshotPath != "" { + b.Config.Workspace.SnapshotPath = md.Config.Workspace.SnapshotPath + } + + return nil +} diff --git a/bundle/deploy/snapshot/path.go b/bundle/deploy/snapshot/path.go new file mode 100644 index 0000000000..820f20776c --- /dev/null +++ b/bundle/deploy/snapshot/path.go @@ -0,0 +1,227 @@ +package snapshot + +import ( + "archive/zip" + "bytes" + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "os" + "path/filepath" + "slices" + "strings" + "time" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/fileset" + "github.com/databricks/cli/libs/git" + "github.com/databricks/cli/libs/notebook" + "github.com/databricks/cli/libs/set" +) + +// zipEpoch is a fixed timestamp used for all zip entries to make the zip content-addressed +// and reproducible: the same file content always produces the same hash regardless of when +// the zip was built or the file's mtime. +var zipEpoch = time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC) + +// snapshotBaseDir is the workspace path under which all immutable snapshots are stored. +// The full path for a snapshot is snapshotBaseDir/{bundle_name}/{snapshot_id}. +const snapshotBaseDir = "/Workspace/Shared/.snapshots" + +// BundleZip builds the zip that is uploaded to the snapshot API. +// It contains: +// - all files from the bundle sync root under the "files/" prefix, +// selected with the same git-aware + include/exclude logic as files.Upload +// - all built artifact files under the "artifacts/.internal/" prefix +// +// The snapshot ID is always IDFromContent(BundleZip(b)), ensuring the +// pre-calculated path and the uploaded path are derived from the same content. +func BundleZip(ctx context.Context, b *bundle.Bundle) ([]byte, error) { + var buf bytes.Buffer + zw := zip.NewWriter(&buf) + + if err := addSyncRootToZip(ctx, zw, b); err != nil { + return nil, err + } + if err := addArtifactsToZip(zw, b); err != nil { + return nil, err + } + + if err := zw.Close(); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// IDFromContent returns the SHA-256 hex digest of content. +func IDFromContent(content []byte) string { + h := sha256.Sum256(content) + return hex.EncodeToString(h[:]) +} + +// SnapshotID builds the bundle zip and returns its SHA-256 hex digest. +// Called after artifacts are built so that ApplyImmutableWorkspacePaths and +// snapshot.Upload both hash identical content. +func SnapshotID(ctx context.Context, b *bundle.Bundle) (string, error) { + content, err := BundleZip(ctx, b) + if err != nil { + return "", err + } + return IDFromContent(content), nil +} + +// SnapshotPath returns the workspace path for a snapshot: +// snapshotBaseDir/{bundleName}/{snapshotID}. +func SnapshotPath(bundleName, snapshotID string) string { + return snapshotBaseDir + "/" + bundleName + "/" + snapshotID +} + +// syncFiles returns the list of files to include in the snapshot zip using the +// same git-aware include/exclude logic as files.Upload (libs/sync). +func syncFiles(ctx context.Context, b *bundle.Bundle) ([]fileset.File, error) { + // Use git.NewFileSet so that .gitignore rules are respected, matching the + // behaviour of the normal files.Upload sync path. + // Avoid passing an empty/nil paths slice: git.NewFileSet forwards it to + // fileset.New whose variadic default ("." if no args) is bypassed when the + // caller explicitly passes a nil slice. The SyncDefaultPath mutator always + // sets Sync.Paths to ["."] in the normal pipeline; we replicate that here + // so BundleZip works even when the bundle hasn't gone through the full pipeline. + var gitFS *git.FileSet + var err error + if len(b.Config.Sync.Paths) > 0 { + gitFS, err = git.NewFileSet(ctx, b.WorktreeRoot, b.SyncRoot, b.Config.Sync.Paths) + } else { + gitFS, err = git.NewFileSet(ctx, b.WorktreeRoot, b.SyncRoot) + } + if err != nil { + return nil, fmt.Errorf("build file set: %w", err) + } + + all := set.NewSetF(func(f fileset.File) string { + return f.Relative + }) + + gitFiles, err := gitFS.Files() + if err != nil { + return nil, fmt.Errorf("list sync files: %w", err) + } + all.Add(gitFiles...) + + if len(b.Config.Sync.Include) > 0 { + includeFS, err := fileset.NewGlobSet(b.SyncRoot, b.Config.Sync.Include) + if err != nil { + return nil, fmt.Errorf("build include set: %w", err) + } + include, err := includeFS.Files() + if err != nil { + return nil, fmt.Errorf("list include files: %w", err) + } + all.Add(include...) + } + + if len(b.Config.Sync.Exclude) > 0 { + excludeFS, err := fileset.NewGlobSet(b.SyncRoot, b.Config.Sync.Exclude) + if err != nil { + return nil, fmt.Errorf("build exclude set: %w", err) + } + exclude, err := excludeFS.Files() + if err != nil { + return nil, fmt.Errorf("list exclude files: %w", err) + } + for _, f := range exclude { + all.Remove(f) + } + } + + files := all.Iter() + // Sort for a stable zip (same content → same hash regardless of map iteration order). + slices.SortFunc(files, func(a, b fileset.File) int { + if a.Relative < b.Relative { + return -1 + } + if a.Relative > b.Relative { + return 1 + } + return 0 + }) + return files, nil +} + +func addSyncRootToZip(ctx context.Context, zw *zip.Writer, b *bundle.Bundle) error { + files, err := syncFiles(ctx, b) + if err != nil { + return err + } + + for _, f := range files { + rc, err := b.SyncRoot.Open(f.Relative) + if err != nil { + return fmt.Errorf("open %s: %w", f.Relative, err) + } + + // Notebooks are stored without their file extension, matching how + // Databricks workspace imports them (e.g. sample_notebook.ipynb → + // sample_notebook). Job tasks reference the extension-stripped path. + entryPath := filepath.ToSlash(f.Relative) + if isNb, _, nbErr := notebook.DetectWithFS(b.SyncRoot, f.Relative); nbErr == nil && isNb { + entryPath = strings.TrimSuffix(entryPath, filepath.Ext(entryPath)) + } + + h := &zip.FileHeader{ + Name: "files/" + entryPath, + Method: zip.Deflate, + Modified: zipEpoch, + } + w, err := zw.CreateHeader(h) + if err != nil { + rc.Close() + return fmt.Errorf("zip entry for %s: %w", f.Relative, err) + } + _, err = io.Copy(w, rc) + rc.Close() + if err != nil { + return fmt.Errorf("write %s: %w", f.Relative, err) + } + } + return nil +} + +func addArtifactsToZip(zw *zip.Writer, b *bundle.Bundle) error { + for _, artifact := range b.Config.Artifacts { + for _, af := range artifact.Files { + source := af.Source + if af.Patched != "" { + source = af.Patched + } + // ".internal" matches libraries.InternalDirName so that ReplaceWithRemotePath + // produces library paths that resolve correctly inside the snapshot. + if err := addLocalFileToZip(zw, source, "artifacts/.internal"); err != nil { + return err + } + } + } + return nil +} + +func addLocalFileToZip(zw *zip.Writer, localPath, zipPrefix string) error { + f, err := os.Open(localPath) + if err != nil { + return fmt.Errorf("open %s: %w", localPath, err) + } + defer f.Close() + + entryName := zipPrefix + "/" + filepath.Base(localPath) + h := &zip.FileHeader{ + Name: entryName, + Method: zip.Deflate, + Modified: zipEpoch, + } + w, err := zw.CreateHeader(h) + if err != nil { + return fmt.Errorf("zip entry %s: %w", entryName, err) + } + _, err = io.Copy(w, f) + return err +} diff --git a/bundle/deploy/snapshot/path_test.go b/bundle/deploy/snapshot/path_test.go new file mode 100644 index 0000000000..f5bc2ab610 --- /dev/null +++ b/bundle/deploy/snapshot/path_test.go @@ -0,0 +1,136 @@ +package snapshot_test + +import ( + "archive/zip" + "bytes" + "os" + "path/filepath" + "slices" + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/deploy/snapshot" + "github.com/databricks/cli/libs/vfs" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func makeBundleWithFiles(t *testing.T, files map[string]string) *bundle.Bundle { + t.Helper() + dir := t.TempDir() + for name, content := range files { + p := filepath.Join(dir, name) + require.NoError(t, os.MkdirAll(filepath.Dir(p), 0o755)) + require.NoError(t, os.WriteFile(p, []byte(content), 0o644)) + } + root := vfs.MustNew(dir) + return &bundle.Bundle{ + BundleRootPath: dir, + SyncRoot: root, + // WorktreeRoot = SyncRoot is the fallback used by LoadGitDetails when + // there is no git repository. + WorktreeRoot: root, + Config: config.Root{}, + } +} + +func TestBundleZipIsDeterministic(t *testing.T) { + b := makeBundleWithFiles(t, map[string]string{ + "main.py": "print('hello')", + "src/task.py": "def run(): pass", + }) + + zip1, err := snapshot.BundleZip(t.Context(), b) + require.NoError(t, err) + zip2, err := snapshot.BundleZip(t.Context(), b) + require.NoError(t, err) + + assert.Equal(t, zip1, zip2, "BundleZip must produce identical bytes for identical content") +} + +func TestBundleZipChangesWithContent(t *testing.T) { + b1 := makeBundleWithFiles(t, map[string]string{"main.py": "v1"}) + b2 := makeBundleWithFiles(t, map[string]string{"main.py": "v2"}) + + zip1, err := snapshot.BundleZip(t.Context(), b1) + require.NoError(t, err) + zip2, err := snapshot.BundleZip(t.Context(), b2) + require.NoError(t, err) + + assert.NotEqual(t, zip1, zip2, "different file content must produce different zips") +} + +func TestBundleZipRespectsExcludes(t *testing.T) { + b := makeBundleWithFiles(t, map[string]string{ + "main.py": "print('hello')", + "skip.json": `{"id": "runtime-generated"}`, + }) + bExclude := makeBundleWithFiles(t, map[string]string{ + "main.py": "print('hello')", + "skip.json": `{"id": "runtime-generated"}`, + }) + bExclude.Config.Sync.Exclude = []string{"*.json"} + + zipAll, err := snapshot.BundleZip(t.Context(), b) + require.NoError(t, err) + zipExcl, err := snapshot.BundleZip(t.Context(), bExclude) + require.NoError(t, err) + + // The zip without the excluded file should be smaller and different. + assert.NotEqual(t, zipAll, zipExcl) + assert.Less(t, len(zipExcl), len(zipAll)) +} + +func TestIDFromContent(t *testing.T) { + id := snapshot.IDFromContent([]byte("hello")) + // SHA-256 of "hello" + assert.Equal(t, "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824", id) + assert.Len(t, id, 64, "SHA-256 hex must be 64 characters") +} + +func TestSnapshotPath(t *testing.T) { + p := snapshot.SnapshotPath("my-bundle", "abc123") + assert.Equal(t, "/Workspace/Shared/.snapshots/my-bundle/abc123", p) +} + +func TestSnapshotIDMatchesBundleZipHash(t *testing.T) { + b := makeBundleWithFiles(t, map[string]string{"task.py": "x = 1"}) + + zipContent, err := snapshot.BundleZip(t.Context(), b) + require.NoError(t, err) + expectedID := snapshot.IDFromContent(zipContent) + + id, err := snapshot.SnapshotID(t.Context(), b) + require.NoError(t, err) + + assert.Equal(t, expectedID, id) +} + +func zipEntryNames(t *testing.T, zipContent []byte) []string { + t.Helper() + r, err := zip.NewReader(bytes.NewReader(zipContent), int64(len(zipContent))) + require.NoError(t, err) + names := make([]string, len(r.File)) + for i, f := range r.File { + names[i] = f.Name + } + return names +} + +func TestBundleZipStripsNotebookExtensions(t *testing.T) { + // Minimal valid Jupyter notebook content. + ipynb := `{"nbformat": 4, "nbformat_minor": 5, "cells": [], "metadata": {}}` + b := makeBundleWithFiles(t, map[string]string{ + "src/my_notebook.ipynb": ipynb, + "src/script.py": "print('hello')", + }) + + zipContent, err := snapshot.BundleZip(t.Context(), b) + require.NoError(t, err) + + names := zipEntryNames(t, zipContent) + assert.True(t, slices.Contains(names, "files/src/my_notebook"), "notebook should have extension stripped") + assert.False(t, slices.Contains(names, "files/src/my_notebook.ipynb"), "notebook should not appear with .ipynb extension") + assert.True(t, slices.Contains(names, "files/src/script.py"), "regular Python file should keep its extension") +} diff --git a/bundle/deploy/snapshot/upload.go b/bundle/deploy/snapshot/upload.go new file mode 100644 index 0000000000..73d84eb970 --- /dev/null +++ b/bundle/deploy/snapshot/upload.go @@ -0,0 +1,69 @@ +package snapshot + +import ( + "context" + "fmt" + "path" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/diag" + "github.com/databricks/cli/libs/filer" + "github.com/databricks/cli/libs/log" +) + +type snapshotUpload struct { + // uploader allows test injection of a custom SnapshotUploader. + uploader filer.SnapshotUploader +} + +// Upload returns a mutator that builds the bundle zip, uploads it via +// /api/2.0/repos/snapshots, and updates workspace.file_path and +// workspace.artifact_path to the content-addressed location returned by the API. +func Upload() bundle.Mutator { + return &snapshotUpload{} +} + +// UploadWithClient returns an upload mutator backed by the provided SnapshotUploader. +// This is intended for use in tests. +func UploadWithClient(uploader filer.SnapshotUploader) bundle.Mutator { + return &snapshotUpload{uploader: uploader} +} + +func (m *snapshotUpload) Name() string { + return "snapshot.Upload" +} + +func (m *snapshotUpload) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + uploader := m.uploader + if uploader == nil { + var err error + uploader, err = filer.NewSnapshotUploader(b.WorkspaceClient(ctx)) + if err != nil { + return diag.FromErr(err) + } + } + + cmdio.LogString(ctx, "Uploading immutable bundle snapshot...") + + zipContent, err := BundleZip(ctx, b) + if err != nil { + return diag.FromErr(fmt.Errorf("failed to build snapshot zip: %w", err)) + } + snapshotID := IDFromContent(zipContent) + log.Debugf(ctx, "snapshot.Upload: snapshotID=%s zip=%d bytes", snapshotID, len(zipContent)) + + info, err := uploader.Upload(ctx, b.Config.Bundle.Name, snapshotID, b.Config.Workspace.CurrentUser.UserName, zipContent) + if err != nil { + return diag.FromErr(err) + } + + log.Infof(ctx, "Snapshot uploaded to %s", info.Path) + + // The API unpacks the zip under a "src" subdirectory. + b.Config.Workspace.SnapshotPath = info.Path + b.Config.Workspace.FilePath = path.Join(info.Path, "src", "files") + b.Config.Workspace.ArtifactPath = path.Join(info.Path, "src", "artifacts") + + return nil +} diff --git a/bundle/internal/schema/annotations.yml b/bundle/internal/schema/annotations.yml index 2f28ca2759..e63f6625d1 100644 --- a/bundle/internal/schema/annotations.yml +++ b/bundle/internal/schema/annotations.yml @@ -50,6 +50,9 @@ github.com/databricks/cli/bundle/config.Bundle: The Git version control details that are associated with your bundle. "markdown_description": |- The Git version control details that are associated with your bundle. For supported attributes see [\_](/dev-tools/bundles/settings.md#git). + "immutable": + "description": |- + Whether to upload bundle files and artifacts as a single immutable snapshot. When true, all files are packaged into a zip and uploaded via the snapshot API, and workspace.file_path and workspace.artifact_path are set to the returned content-addressed path. The validate and plan commands make no mutative API calls when this is enabled. "name": "description": |- The name of the bundle. diff --git a/bundle/metadata/metadata.go b/bundle/metadata/metadata.go index 1c61cb093f..e554947269 100644 --- a/bundle/metadata/metadata.go +++ b/bundle/metadata/metadata.go @@ -15,6 +15,9 @@ type Bundle struct { type Workspace struct { FilePath string `json:"file_path"` + // SnapshotPath is the workspace path of the immutable snapshot uploaded + // during deployment. Only populated for bundles with bundle.immutable = true. + SnapshotPath string `json:"snapshot_path,omitempty"` } type Resource struct { diff --git a/bundle/phases/build.go b/bundle/phases/build.go index 5a32435f8f..a0649e373f 100644 --- a/bundle/phases/build.go +++ b/bundle/phases/build.go @@ -14,6 +14,7 @@ import ( "github.com/databricks/cli/libs/logdiag" ) + type LibLocationMap map[string][]libraries.LocationToUpdate // The build phase builds artifacts. @@ -24,6 +25,7 @@ func Build(ctx context.Context, b *bundle.Bundle) LibLocationMap { scripts.Execute(config.ScriptPreBuild), artifacts.Build(), scripts.Execute(config.ScriptPostBuild), + mutator.ResolveVariableReferencesWithoutResources( "artifacts", ), @@ -41,6 +43,12 @@ func Build(ctx context.Context, b *bundle.Bundle) LibLocationMap { libraries.SwitchToPatchedWheels(), ) + // For immutable bundles, library remote paths are set in the deploy phase + // after snapshot.Upload() provides the content-addressed workspace.artifact_path. + if b.Config.Bundle.Immutable { + return nil + } + libs, diags := libraries.ReplaceWithRemotePath(ctx, b) for _, diag := range diags { logdiag.LogDiag(ctx, diag) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index b4d70ede5a..5534354bc6 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -8,10 +8,12 @@ import ( "github.com/databricks/cli/bundle/artifacts" "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/config/mutator" "github.com/databricks/cli/bundle/deploy" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" "github.com/databricks/cli/bundle/deploy/metadata" + "github.com/databricks/cli/bundle/deploy/snapshot" "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" @@ -131,13 +133,38 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) }() - uploadLibraries(ctx, b, libs) + if b.Config.Bundle.Immutable { + // Upload all source files and built artifacts as a single immutable snapshot. + // The API assigns a content-addressed path, so workspace.snapshot_path (and + // derived workspace.file_path / workspace.artifact_path) are only known after + // upload. Resolve variable references in resources and set library remote paths + // once the actual paths are available. + bundle.ApplySeqContext(ctx, b, + snapshot.Upload(), + mutator.ResolveVariableReferencesOnlyResources(), + ) + if !logdiag.HasError(ctx) { + _, libDiags := libraries.ReplaceWithRemotePath(ctx, b) + for _, d := range libDiags { + logdiag.LogDiag(ctx, d) + } + } + } else { + uploadLibraries(ctx, b, libs) + } + if logdiag.HasError(ctx) { return } + if !b.Config.Bundle.Immutable { + bundle.ApplySeqContext(ctx, b, files.Upload(outputHandler)) + if logdiag.HasError(ctx) { + return + } + } + bundle.ApplySeqContext(ctx, b, - files.Upload(outputHandler), deploy.StateUpdate(), deploy.StatePush(), permissions.ApplyWorkspaceRootPermissions(), diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 91640ac6ca..71d5468c25 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -10,6 +10,7 @@ import ( "github.com/databricks/cli/bundle/config/mutator" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" + deploymetadata "github.com/databricks/cli/bundle/deploy/metadata" "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" @@ -123,17 +124,29 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { }() if !engine.IsDirect() { - bundle.ApplySeqContext(ctx, b, + mutators := []bundle.Mutator{ // We need to resolve artifact variable (how we do it in build phase) // because some of the to-be-destroyed resource might use this variable. // Not resolving might lead to terraform "Reference to undeclared resource" error mutator.ResolveVariableReferencesWithoutResources("artifacts"), mutator.ResolveVariableReferencesOnlyResources("artifacts"), + } + + if b.Config.Bundle.Immutable { + // For immutable bundles, resource paths contain ${workspace.snapshot_path} + // which was set during deploy by snapshot.Upload(). Load it from the stored + // metadata so it can be resolved before Terraform processes the config. + mutators = append([]bundle.Mutator{deploymetadata.Load()}, mutators...) + mutators = append(mutators, mutator.ResolveVariableReferencesOnlyResources()) + } + mutators = append(mutators, terraform.Interpolate(), terraform.Write(), terraform.Plan(terraform.PlanGoal("destroy")), ) + + bundle.ApplySeqContext(ctx, b, mutators...) } if logdiag.HasError(ctx) { diff --git a/libs/filer/snapshot_client.go b/libs/filer/snapshot_client.go new file mode 100644 index 0000000000..a7d84891cb --- /dev/null +++ b/libs/filer/snapshot_client.go @@ -0,0 +1,103 @@ +package filer + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "mime/multipart" + "net/http" + "net/textproto" + + "github.com/databricks/databricks-sdk-go" + "github.com/databricks/databricks-sdk-go/client" +) + +// SnapshotInfo holds the result of a successful snapshot upload. +type SnapshotInfo struct { + // Path is the immutable workspace path for the uploaded snapshot content. + Path string +} + +// SnapshotUploader abstracts the /api/2.0/repos/snapshots endpoint. +// snapshotID is the content-addressed key supplied by the caller; the API uses +// it as the final path component so that identical content always resolves to +// the same workspace location. +// This interface exists so the implementation can later be replaced with a Go SDK call. +type SnapshotUploader interface { + Upload(ctx context.Context, bundleID, snapshotID, currentUser string, zipContent []byte) (*SnapshotInfo, error) +} + +// snapshotAPIClient implements SnapshotUploader against /api/2.0/repos/snapshots. +type snapshotAPIClient struct { + apiClient apiClient +} + +// snapshotUploadResponse mirrors the /api/2.0/repos/snapshots response body. +type snapshotUploadResponse struct { + Snapshot struct { + Path string `json:"path"` + } `json:"snapshot"` +} + +// NewSnapshotUploader creates a SnapshotUploader backed by /api/2.0/repos/snapshots. +func NewSnapshotUploader(w *databricks.WorkspaceClient) (SnapshotUploader, error) { + apiClient, err := client.New(w.Config) + if err != nil { + return nil, err + } + return &snapshotAPIClient{apiClient: apiClient}, nil +} + +// Upload uploads zipContent as an immutable snapshot identified by snapshotID. +// snapshotID is the SHA-256 of the files-only zip and is used by the server as +// the content-addressed path component. currentUser is granted CAN_READ on the snapshot. +func (c *snapshotAPIClient) Upload(ctx context.Context, bundleID, snapshotID, currentUser string, zipContent []byte) (*SnapshotInfo, error) { + var body bytes.Buffer + mw := multipart.NewWriter(&body) + + if err := mw.WriteField("snapshot_id", snapshotID); err != nil { + return nil, fmt.Errorf("failed to write snapshot_id: %w", err) + } + if err := mw.WriteField("bundle_id", bundleID); err != nil { + return nil, fmt.Errorf("failed to write bundle_id: %w", err) + } + + // The API requires an access_control_list granting the current user read access. + acl, err := json.Marshal([]map[string]string{ + {"user_name": currentUser, "permission_level": "CAN_READ"}, + }) + if err != nil { + return nil, fmt.Errorf("failed to marshal access_control_list: %w", err) + } + if err := mw.WriteField("access_control_list", string(acl)); err != nil { + return nil, fmt.Errorf("failed to write access_control_list: %w", err) + } + + // Attach the zip with an explicit content-type so the server treats it as binary. + fh := make(textproto.MIMEHeader) + fh.Set("Content-Disposition", `form-data; name="file"; filename="snapshot.zip"`) + fh.Set("Content-Type", "application/zip") + part, err := mw.CreatePart(fh) + if err != nil { + return nil, fmt.Errorf("failed to create file part: %w", err) + } + if _, err := part.Write(zipContent); err != nil { + return nil, fmt.Errorf("failed to write zip content: %w", err) + } + if err := mw.Close(); err != nil { + return nil, fmt.Errorf("failed to finalize multipart body: %w", err) + } + + headers := map[string]string{ + "Content-Type": mw.FormDataContentType(), + } + + var resp snapshotUploadResponse + err = c.apiClient.Do(ctx, http.MethodPost, "/api/2.0/repos/snapshots", headers, nil, body.Bytes(), &resp) + if err != nil { + return nil, fmt.Errorf("snapshot upload: %w", err) + } + + return &SnapshotInfo{Path: resp.Snapshot.Path}, nil +}