From dd22437cf42a65b5a467c90e1c03f08b4a63ec28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Thu, 11 Jun 2026 10:33:51 +0200 Subject: [PATCH 01/16] PMM-14576 Initial improvements. --- agent/runner/jobs/pbm_helpers.go | 275 ++++++++++++++++++++------ agent/runner/jobs/pbm_helpers_test.go | 104 ++++++++++ 2 files changed, 317 insertions(+), 62 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index e8ec35ca55e..d2f5a928486 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -35,6 +35,18 @@ const ( resyncTimeout = 5 * time.Minute statusCheckInterval = 5 * time.Second maxRestoreChecks = 100 + + maxDescribeCommandRetries = 5 + // PBM waits up to ~33s for backup metadata; allow extra margin for describe-backup. + pbmDescribeStartupGrace = 60 * time.Second + + pbmCmdBackup = "backup" + pbmCmdRestore = "restore" + + pbmStatusDone = "done" + pbmStatusCanceled = "canceled" + pbmStatusError = "error" + pbmStatusPartlyDone = "partlyDone" ) type pbmSeverity int @@ -48,6 +60,8 @@ type describeInfo struct { type replSet struct { Name string `json:"name"` Status string `json:"status"` + Error string `json:"error,omitempty"` + Node string `json:"node,omitempty"` Nodes []node `json:"nodes"` } @@ -254,44 +268,185 @@ func getPBMStatus(ctx context.Context, dsn string) (*pbmStatus, error) { return &status, nil } +type pbmDescribePollConfig struct { + l logrus.FieldLogger + dsn string + operation string + targetName string + startedAt time.Time + describeRetries *int + fetchDescribe func(context.Context) (describeInfo, error) + isRunning func(*pbmStatus) bool + findSnapshot func(*pbmStatus) *pbmSnapshot +} + func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, name string) error { l.Infof("waiting for pbm backup: %s", name) + + describeRetries := maxDescribeCommandRetries + return waitForPBMDescribe(ctx, pbmDescribePollConfig{ + l: l, + dsn: dsn, + operation: "backup", + targetName: name, + startedAt: time.Now(), + describeRetries: &describeRetries, + fetchDescribe: func(ctx context.Context) (describeInfo, error) { + var info describeInfo + err := execPBMCommand(ctx, dsn, &info, "describe-backup", name) + return info, err + }, + isRunning: func(status *pbmStatus) bool { + return isPBMBackupRunning(status, name) + }, + findSnapshot: func(status *pbmStatus) *pbmSnapshot { + return findPBMSnapshot(status, name) + }, + }) +} + +func waitForPBMDescribe(ctx context.Context, cfg pbmDescribePollConfig) error { ticker := time.NewTicker(statusCheckInterval) defer ticker.Stop() - retryCount := 500 - for { select { case <-ticker.C: - var info describeInfo - err := execPBMCommand(ctx, dsn, &info, "describe-backup", name) + done, err := pollPBMDescribeOnce(ctx, cfg) if err != nil { - // for the first couple of seconds after backup process starts describe-backup command may return this error - if (strings.HasSuffix(err.Error(), "no such file") || - strings.HasSuffix(err.Error(), "file is empty")) && retryCount > 0 { - retryCount-- - continue - } - - return errors.Wrap(err, "failed to get backup status") + return err } - - switch info.Status { - case "done": + if done { return nil - case "canceled": - return errors.New("backup was canceled") - case "error": - return errors.New(info.Error) } - case <-ctx.Done(): return ctx.Err() } } } +func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, error) { + info, describeErr := cfg.fetchDescribe(ctx) + if describeErr == nil { + *cfg.describeRetries = maxDescribeCommandRetries + return describeTerminalError(info, cfg.operation) + } + + status, statusErr := getPBMStatus(ctx, cfg.dsn) + if statusErr != nil { + return false, errors.Wrap(statusErr, "failed to get pbm status") + } + + if cfg.isRunning(status) { + if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { + cfg.l.Debugf("describe-%s transient error while %s is running: %s", cfg.operation, cfg.operation, describeErr) + return false, nil + } + if retryDescribeCommand(cfg, describeErr) { + return false, nil + } + } + + if cfg.findSnapshot != nil { + if snapshot := cfg.findSnapshot(status); snapshot != nil { + done, err := snapshotTerminalError(snapshot, cfg.operation) + if done { + return true, err + } + } + } + + if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { + return false, nil + } + if retryDescribeCommand(cfg, describeErr) { + return false, nil + } + + return false, errors.Wrapf(describeErr, "failed to get %s status", cfg.operation) +} + +func shouldRetryDescribeFailure(err error, startedAt time.Time) bool { + return isTransientPBMDescribeError(err) && time.Since(startedAt) < pbmDescribeStartupGrace +} + +func retryDescribeCommand(cfg pbmDescribePollConfig, err error) bool { + if *cfg.describeRetries <= 0 { + return false + } + *cfg.describeRetries-- + cfg.l.Warnf("describe-%s failed and will retry: %s", cfg.operation, err) + return true +} + +func isTransientPBMDescribeError(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "no such file") || + strings.Contains(msg, "file is empty") || + strings.Contains(msg, "not found") || + strings.Contains(msg, "get backup meta") || + strings.Contains(msg, "get snapshot size") || + strings.Contains(msg, "missed file") +} + +func isPBMBackupRunning(status *pbmStatus, name string) bool { + return status.Running.Type == pbmCmdBackup && status.Running.Name == name +} + +func isPBMRestoreRunning(status *pbmStatus) bool { + return status.Running.Type == pbmCmdRestore +} + +func findPBMSnapshot(status *pbmStatus, name string) *pbmSnapshot { + for i := range status.Backups.Snapshot { + if status.Backups.Snapshot[i].Name == name { + return &status.Backups.Snapshot[i] + } + } + return nil +} + +func describeTerminalError(info describeInfo, operation string) (bool, error) { + switch info.Status { + case pbmStatusDone: + return true, nil + case pbmStatusCanceled: + return true, errors.Errorf("%s was canceled", operation) + case pbmStatusError: + return true, describeFailureError(info, operation) + case pbmStatusPartlyDone: + return true, groupDescribeErrors(info) + default: + return false, nil + } +} + +func snapshotTerminalError(snapshot *pbmSnapshot, operation string) (bool, error) { + switch snapshot.Status { + case pbmStatusDone: + return true, nil + case pbmStatusCanceled: + return true, errors.Errorf("%s was canceled", operation) + case pbmStatusError: + if snapshot.Error != "" { + return true, errors.New(snapshot.Error) + } + return true, errors.Errorf("%s failed", operation) + default: + return false, nil + } +} + +func describeFailureError(info describeInfo, operation string) error { + if err := groupDescribeErrors(info); err != nil && err.Error() != "operation failed" { + return err + } + return errors.Errorf("%s failed", operation) +} + func findPITRRestore(list []pbmListRestore, restoreInfoPITRTime int64, startedAt time.Time) *pbmListRestore { for _, v := range slices.Backward(list) { // TODO when PITR restore invoked with wrong timestamp pbm marks this restore operation as "snapshot" type. @@ -365,47 +520,28 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dsn string, re l.Infof("waiting for pbm restore: %s", name) - ticker := time.NewTicker(statusCheckInterval) - defer ticker.Stop() - - maxRetryCount := 5 - for { - select { - case <-ticker.C: + describeRetries := maxDescribeCommandRetries + return waitForPBMDescribe(ctx, pbmDescribePollConfig{ + l: l, + dsn: dsn, + operation: "restore", + targetName: name, + startedAt: time.Now(), + describeRetries: &describeRetries, + fetchDescribe: func(ctx context.Context) (describeInfo, error) { var info describeInfo + var describeErr error if backupType == "physical" { - err = execPBMCommand(ctx, dsn, &info, "describe-restore", name, "--config="+confFile) + describeErr = execPBMCommand(ctx, dsn, &info, "describe-restore", name, "--config="+confFile) } else { - err = execPBMCommand(ctx, dsn, &info, "describe-restore", name) - } - if err != nil { - if maxRetryCount > 0 { - maxRetryCount-- - l.Warnf("PMM failed to get backup restore status and will retry: %s", err) - continue - } else { //nolint:revive - return errors.Wrap(err, "failed to get restore status") - } - } - // reset maxRetryCount if we were able to successfully get the current restore status - maxRetryCount = 5 - - switch info.Status { - case "done": - return nil - case "canceled": - return errors.New("restore was canceled") - case "error": - return errors.New(info.Error) - // We consider partlyDone as an error because we cannot automatically recover cluster from this status to fully working. - case "partlyDone": - return groupPartlyDoneErrors(info) + describeErr = execPBMCommand(ctx, dsn, &info, "describe-restore", name) } - - case <-ctx.Done(): - return ctx.Err() - } - } + return info, describeErr + }, + isRunning: func(status *pbmStatus) bool { + return isPBMRestoreRunning(status) + }, + }) } func pbmConfigure(ctx context.Context, l logrus.FieldLogger, params pbmConfigParams) error { @@ -538,21 +674,36 @@ func createPBMConfig(locationConfig *BackupLocationConfig, prefix string, pitr b return conf, nil } -func groupPartlyDoneErrors(info describeInfo) error { +func groupDescribeErrors(info describeInfo) error { var errMsgs []string + if info.Error != "" { + errMsgs = append(errMsgs, info.Error) + } + for _, rs := range info.ReplSets { - if rs.Status == "partlyDone" { - for _, node := range rs.Nodes { - if node.Status == "error" { - errMsgs = append(errMsgs, fmt.Sprintf("replset: %s, node: %s, error: %s", rs.Name, node.Name, node.Error)) + if rs.Error != "" { + errMsgs = append(errMsgs, fmt.Sprintf("replset: %s, error: %s", rs.Name, rs.Error)) + } + if rs.Status == pbmStatusPartlyDone { + for _, n := range rs.Nodes { + if n.Status == pbmStatusError { + errMsgs = append(errMsgs, fmt.Sprintf("replset: %s, node: %s, error: %s", rs.Name, n.Name, n.Error)) } } } } + + if len(errMsgs) == 0 { + return errors.New("operation failed") + } return errors.New(strings.Join(errMsgs, "; ")) } +func groupPartlyDoneErrors(info describeInfo) error { + return groupDescribeErrors(info) +} + // pbmGetSnapshotTimestamp returns time the backup restores target db to. func pbmGetSnapshotTimestamp(ctx context.Context, l logrus.FieldLogger, dsn string, backupName string) (*time.Time, error) { snapshots, err := getSnapshots(ctx, l, dsn) diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index ccc1031da59..883cab19418 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -18,6 +18,7 @@ import ( "testing" "time" + "github.com/pkg/errors" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -126,6 +127,109 @@ func TestCreatePBMConfig(t *testing.T) { } } +func TestIsTransientPBMDescribeError(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + name string + err error + want bool + }{ + { + name: "no such file", + err: errors.New(`get file 2024-01-01T00:00:00Z/rs0/metadata.json: no such file`), + want: true, + }, + { + name: "file is empty", + err: errors.New("get file foo: file is empty"), + want: true, + }, + { + name: "backup meta not found", + err: errors.New("get backup meta: not found"), + want: true, + }, + { + name: "real failure", + err: errors.New("permission denied"), + want: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tc.want, isTransientPBMDescribeError(tc.err)) + }) + } +} + +func TestDescribeTerminalError(t *testing.T) { + t.Parallel() + + done, err := describeTerminalError(describeInfo{Status: pbmStatusDone}, "backup") + require.NoError(t, err) + assert.True(t, done) + + done, err = describeTerminalError(describeInfo{Status: pbmStatusCanceled}, "backup") + require.EqualError(t, err, "backup was canceled") + assert.True(t, done) + + done, err = describeTerminalError(describeInfo{Status: pbmStatusError, Error: "oplog has insufficient range"}, "backup") + require.EqualError(t, err, "oplog has insufficient range") + assert.True(t, done) + + done, err = describeTerminalError(describeInfo{Status: "running"}, "backup") + require.NoError(t, err) + assert.False(t, done) +} + +func TestSnapshotTerminalError(t *testing.T) { + t.Parallel() + + done, err := snapshotTerminalError(&pbmSnapshot{Status: pbmStatusDone}, "backup") + require.NoError(t, err) + assert.True(t, done) + + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError, Error: "storage unavailable"}, "backup") + require.EqualError(t, err, "storage unavailable") + assert.True(t, done) + + done, err = snapshotTerminalError(&pbmSnapshot{Status: "running"}, "backup") + require.NoError(t, err) + assert.False(t, done) +} + +func TestFindPBMSnapshot(t *testing.T) { + t.Parallel() + + status := &pbmStatus{} + status.Backups.Snapshot = []pbmSnapshot{ + {Name: "2024-01-01T00:00:00Z", Status: pbmStatusDone}, + {Name: "2024-01-02T00:00:00Z", Status: pbmStatusError, Error: "failed"}, + } + + assert.Nil(t, findPBMSnapshot(status, "missing")) + require.NotNil(t, findPBMSnapshot(status, "2024-01-02T00:00:00Z")) +} + +func TestGroupDescribeErrors(t *testing.T) { + t.Parallel() + + err := groupDescribeErrors(describeInfo{ + Status: pbmStatusPartlyDone, + ReplSets: []replSet{{ + Name: "rs0", + Status: pbmStatusPartlyDone, + Nodes: []node{{ + Name: "node1", + Status: pbmStatusError, + Error: "copy failed", + }}, + }}, + }) + require.EqualError(t, err, "replset: rs0, node: node1, error: copy failed") +} + func TestFindPITRRestore(t *testing.T) { // Tested func searches from the end, so we place records to be skipped at the end. testList := []pbmListRestore{ From 33e102212f17de7305d06e6ba386e14b02c98e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Fri, 12 Jun 2026 09:58:11 +0200 Subject: [PATCH 02/16] PMM-14576 Increase coverage. --- agent/runner/jobs/pbm_helpers.go | 34 +- agent/runner/jobs/pbm_helpers_test.go | 475 ++++++++++++++++++++++++++ 2 files changed, 498 insertions(+), 11 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index d2f5a928486..b00c277fcb4 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -269,17 +269,24 @@ func getPBMStatus(ctx context.Context, dsn string) (*pbmStatus, error) { } type pbmDescribePollConfig struct { - l logrus.FieldLogger - dsn string - operation string - targetName string - startedAt time.Time - describeRetries *int - fetchDescribe func(context.Context) (describeInfo, error) - isRunning func(*pbmStatus) bool - findSnapshot func(*pbmStatus) *pbmSnapshot + l logrus.FieldLogger + dsn string + operation string + targetName string + startedAt time.Time + describeRetries *int + fetchDescribe func(context.Context) (describeInfo, error) + fetchStatus func(context.Context, string) (*pbmStatus, error) + isRunning func(*pbmStatus) bool + findSnapshot func(*pbmStatus) *pbmSnapshot } +// pbmDescribePollInterval is used by waitForPBMDescribe; tests may override it. +var pbmDescribePollInterval = statusCheckInterval + +// pbmStatusFetcher fetches PBM status; tests may override it. +var pbmStatusFetcher = getPBMStatus + func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, name string) error { l.Infof("waiting for pbm backup: %s", name) @@ -306,7 +313,7 @@ func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, nam } func waitForPBMDescribe(ctx context.Context, cfg pbmDescribePollConfig) error { - ticker := time.NewTicker(statusCheckInterval) + ticker := time.NewTicker(pbmDescribePollInterval) defer ticker.Stop() for { @@ -332,7 +339,12 @@ func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, return describeTerminalError(info, cfg.operation) } - status, statusErr := getPBMStatus(ctx, cfg.dsn) + fetchStatus := cfg.fetchStatus + if fetchStatus == nil { + fetchStatus = pbmStatusFetcher + } + + status, statusErr := fetchStatus(ctx, cfg.dsn) if statusErr != nil { return false, errors.Wrap(statusErr, "failed to get pbm status") } diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index 883cab19418..06bceeb5e38 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -15,14 +15,51 @@ package jobs import ( + "context" + "os" "testing" "time" "github.com/pkg/errors" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) +func withFastPBMDescribePoll(t *testing.T) { + t.Helper() + + old := pbmDescribePollInterval + pbmDescribePollInterval = time.Millisecond + t.Cleanup(func() { + pbmDescribePollInterval = old + }) +} + +func newTestDescribePollConfig(t *testing.T, opts ...func(*pbmDescribePollConfig)) pbmDescribePollConfig { + t.Helper() + + retries := maxDescribeCommandRetries + cfg := pbmDescribePollConfig{ + l: logrus.New(), + dsn: "mongodb://localhost", + operation: "backup", + targetName: "2024-01-01T00:00:00Z", + startedAt: time.Now(), + describeRetries: &retries, + fetchDescribe: func(context.Context) (describeInfo, error) { + return describeInfo{Status: pbmStatusDone}, nil + }, + isRunning: func(*pbmStatus) bool { + return false + }, + } + for _, opt := range opts { + opt(&cfg) + } + return cfg +} + func TestCreatePBMConfig(t *testing.T) { s3Config := S3LocationConfig{ Endpoint: "test_endpoint", @@ -130,6 +167,8 @@ func TestCreatePBMConfig(t *testing.T) { func TestIsTransientPBMDescribeError(t *testing.T) { t.Parallel() + assert.False(t, isTransientPBMDescribeError(nil)) + for _, tc := range []struct { name string err error @@ -150,6 +189,11 @@ func TestIsTransientPBMDescribeError(t *testing.T) { err: errors.New("get backup meta: not found"), want: true, }, + { + name: "get snapshot size", + err: errors.New("get snapshot size: missed file"), + want: true, + }, { name: "real failure", err: errors.New("permission denied"), @@ -163,6 +207,412 @@ func TestIsTransientPBMDescribeError(t *testing.T) { } } +func TestIsPBMBackupRunning(t *testing.T) { + t.Parallel() + + status := &pbmStatus{} + status.Running.Type = pbmCmdBackup + status.Running.Name = "backup-1" + + assert.True(t, isPBMBackupRunning(status, "backup-1")) + assert.False(t, isPBMBackupRunning(status, "backup-2")) + assert.False(t, isPBMBackupRunning(&pbmStatus{}, "backup-1")) +} + +func TestIsPBMRestoreRunning(t *testing.T) { + t.Parallel() + + status := &pbmStatus{} + status.Running.Type = pbmCmdRestore + assert.True(t, isPBMRestoreRunning(status)) + assert.False(t, isPBMRestoreRunning(&pbmStatus{})) +} + +func TestShouldRetryDescribeFailure(t *testing.T) { + t.Parallel() + + transientErr := errors.New("no such file") + assert.True(t, shouldRetryDescribeFailure(transientErr, time.Now())) + assert.False(t, shouldRetryDescribeFailure(transientErr, time.Now().Add(-pbmDescribeStartupGrace))) + assert.False(t, shouldRetryDescribeFailure(errors.New("permission denied"), time.Now())) +} + +func TestRetryDescribeCommand(t *testing.T) { + t.Parallel() + + retries := 1 + cfg := pbmDescribePollConfig{ + l: logrus.New(), + operation: "backup", + describeRetries: &retries, + } + + assert.True(t, retryDescribeCommand(cfg, errors.New("temporary"))) + assert.Equal(t, 0, retries) + assert.False(t, retryDescribeCommand(cfg, errors.New("temporary"))) +} + +func TestDescribeFailureError(t *testing.T) { + t.Parallel() + + err := describeFailureError(describeInfo{Status: pbmStatusError}, "backup") + require.EqualError(t, err, "backup failed") + + err = describeFailureError(describeInfo{Status: pbmStatusError, Error: "oplog gap"}, "backup") + require.EqualError(t, err, "oplog gap") +} + +func TestGroupDescribeErrors_AllBranches(t *testing.T) { + t.Parallel() + + err := groupDescribeErrors(describeInfo{}) + require.EqualError(t, err, "operation failed") + + err = groupDescribeErrors(describeInfo{Error: "top level"}) + require.EqualError(t, err, "top level") + + err = groupDescribeErrors(describeInfo{ + ReplSets: []replSet{{Name: "rs0", Error: "rs failed"}}, + }) + require.EqualError(t, err, "replset: rs0, error: rs failed") +} + +func TestGroupPartlyDoneErrors(t *testing.T) { + t.Parallel() + + err := groupPartlyDoneErrors(describeInfo{ + Status: pbmStatusPartlyDone, + ReplSets: []replSet{{ + Name: "rs0", + Status: pbmStatusPartlyDone, + Nodes: []node{{ + Name: "node1", + Status: pbmStatusError, + Error: "copy failed", + }}, + }}, + }) + require.EqualError(t, err, "replset: rs0, node: node1, error: copy failed") +} + +func TestPollPBMDescribeOnce(t *testing.T) { + t.Parallel() + + t.Run("describe done", func(t *testing.T) { + t.Parallel() + done, err := pollPBMDescribeOnce(context.Background(), newTestDescribePollConfig(t)) + require.NoError(t, err) + assert.True(t, done) + }) + + t.Run("describe in progress", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{Status: "running"}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + }) + + t.Run("describe canceled", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{Status: pbmStatusCanceled}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.EqualError(t, err, "backup was canceled") + assert.True(t, done) + }) + + t.Run("describe partly done", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{ + Status: pbmStatusPartlyDone, + ReplSets: []replSet{{ + Name: "rs0", + Status: pbmStatusPartlyDone, + Nodes: []node{{ + Name: "node1", + Status: pbmStatusError, + Error: "failed node", + }}, + }}, + }, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.EqualError(t, err, "replset: rs0, node: node1, error: failed node") + assert.True(t, done) + }) + + t.Run("status fetch error", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("describe failed") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return nil, errors.New("status unavailable") + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.ErrorContains(t, err, "failed to get pbm status") + assert.False(t, done) + }) + + t.Run("running backup with transient describe error", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("no such file") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + status := &pbmStatus{} + status.Running.Type = pbmCmdBackup + status.Running.Name = c.targetName + return status, nil + } + c.isRunning = func(status *pbmStatus) bool { + return isPBMBackupRunning(status, c.targetName) + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + }) + + t.Run("running backup retries describe command", func(t *testing.T) { + t.Parallel() + retries := maxDescribeCommandRetries + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.describeRetries = &retries + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("permission denied") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + status := &pbmStatus{} + status.Running.Type = pbmCmdBackup + status.Running.Name = c.targetName + return status, nil + } + c.isRunning = func(status *pbmStatus) bool { + return isPBMBackupRunning(status, c.targetName) + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + assert.Equal(t, maxDescribeCommandRetries-1, retries) + }) + + t.Run("snapshot done when describe fails", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("no such file") + } + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + status := &pbmStatus{} + status.Backups.Snapshot = []pbmSnapshot{{ + Name: c.targetName, + Status: pbmStatusDone, + }} + return status, nil + } + c.findSnapshot = func(status *pbmStatus) *pbmSnapshot { + return findPBMSnapshot(status, c.targetName) + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.True(t, done) + }) + + t.Run("snapshot terminal error", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("no such file") + } + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + status := &pbmStatus{} + status.Backups.Snapshot = []pbmSnapshot{{ + Name: c.targetName, + Status: pbmStatusError, + Error: "storage error", + }} + return status, nil + } + c.findSnapshot = func(status *pbmStatus) *pbmSnapshot { + return findPBMSnapshot(status, c.targetName) + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.EqualError(t, err, "storage error") + assert.True(t, done) + }) + + t.Run("startup grace for transient error", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("file is empty") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + }) + + t.Run("uses default status fetcher", func(t *testing.T) { + t.Parallel() + + oldFetcher := pbmStatusFetcher + pbmStatusFetcher = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + t.Cleanup(func() { + pbmStatusFetcher = oldFetcher + }) + + retries := 0 + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.describeRetries = &retries + c.fetchStatus = nil + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("permission denied") + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.ErrorContains(t, err, "failed to get backup status") + assert.False(t, done) + }) + + t.Run("retries after startup grace when retries remain", func(t *testing.T) { + t.Parallel() + retries := 2 + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.describeRetries = &retries + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("file is empty") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + assert.Equal(t, 1, retries) + }) + + t.Run("running backup with exhausted retries returns error", func(t *testing.T) { + t.Parallel() + retries := 0 + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.describeRetries = &retries + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("permission denied") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + status := &pbmStatus{} + status.Running.Type = pbmCmdBackup + status.Running.Name = c.targetName + return status, nil + } + c.isRunning = func(status *pbmStatus) bool { + return isPBMBackupRunning(status, c.targetName) + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.ErrorContains(t, err, "failed to get backup status") + assert.False(t, done) + }) + + t.Run("describe failure without running backup", func(t *testing.T) { + t.Parallel() + retries := 0 + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.describeRetries = &retries + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("permission denied") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.ErrorContains(t, err, "failed to get backup status") + assert.False(t, done) + }) +} + +func TestWaitForPBMDescribe(t *testing.T) { + withFastPBMDescribePoll(t) + + t.Run("completes when describe reports done", func(t *testing.T) { + err := waitForPBMDescribe(context.Background(), newTestDescribePollConfig(t)) + require.NoError(t, err) + }) + + t.Run("returns describe error", func(t *testing.T) { + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{Status: pbmStatusCanceled}, nil + } + }) + err := waitForPBMDescribe(context.Background(), cfg) + require.EqualError(t, err, "backup was canceled") + }) + + t.Run("context canceled", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + err := waitForPBMDescribe(ctx, newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{Status: "running"}, nil + } + })) + require.ErrorIs(t, err, context.Canceled) + }) +} + +func TestWritePBMConfigFile(t *testing.T) { + t.Parallel() + + conf, err := createPBMConfig(&BackupLocationConfig{ + Type: FilesystemBackupLocationType, + FilesystemStorageConfig: &FilesystemBackupLocationConfig{ + Path: "/tmp/pbm", + }, + }, "artifact", false) + require.NoError(t, err) + + path, err := writePBMConfigFile(conf) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, os.Remove(path)) + }) + require.FileExists(t, path) +} + func TestDescribeTerminalError(t *testing.T) { t.Parallel() @@ -178,6 +628,10 @@ func TestDescribeTerminalError(t *testing.T) { require.EqualError(t, err, "oplog has insufficient range") assert.True(t, done) + done, err = describeTerminalError(describeInfo{Status: pbmStatusPartlyDone, Error: "partial"}, "backup") + require.EqualError(t, err, "partial") + assert.True(t, done) + done, err = describeTerminalError(describeInfo{Status: "running"}, "backup") require.NoError(t, err) assert.False(t, done) @@ -190,10 +644,18 @@ func TestSnapshotTerminalError(t *testing.T) { require.NoError(t, err) assert.True(t, done) + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusCanceled}, "backup") + require.EqualError(t, err, "backup was canceled") + assert.True(t, done) + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError, Error: "storage unavailable"}, "backup") require.EqualError(t, err, "storage unavailable") assert.True(t, done) + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError}, "backup") + require.EqualError(t, err, "backup failed") + assert.True(t, done) + done, err = snapshotTerminalError(&pbmSnapshot{Status: "running"}, "backup") require.NoError(t, err) assert.False(t, done) @@ -230,6 +692,19 @@ func TestGroupDescribeErrors(t *testing.T) { require.EqualError(t, err, "replset: rs0, node: node1, error: copy failed") } +func TestFindPITRRestoreSkipsInvalidEntries(t *testing.T) { + t.Parallel() + + startedAt, err := time.Parse(time.RFC3339Nano, "2022-10-11T14:53:20.000000000Z") + require.NoError(t, err) + + list := []pbmListRestore{ + {Name: "invalid-name", Type: "pitr", PITR: 1000000000}, + {Name: "2022-10-11T14:53:20.000000001Z", Type: "snapshot", Snapshot: "snap"}, + } + assert.Nil(t, findPITRRestore(list, 1000000000, startedAt)) +} + func TestFindPITRRestore(t *testing.T) { // Tested func searches from the end, so we place records to be skipped at the end. testList := []pbmListRestore{ From 035098483bf095bec54ea37ef01f6111fbf30d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Fri, 12 Jun 2026 10:22:47 +0200 Subject: [PATCH 03/16] PMM-14576 Lint, changes. --- agent/runner/jobs/pbm_helpers.go | 52 +++++++++++++-------- agent/runner/jobs/pbm_helpers_test.go | 67 +++++++++++++++++++++------ 2 files changed, 85 insertions(+), 34 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index b00c277fcb4..82bd4dd3ecb 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -294,7 +294,7 @@ func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, nam return waitForPBMDescribe(ctx, pbmDescribePollConfig{ l: l, dsn: dsn, - operation: "backup", + operation: pbmCmdBackup, targetName: name, startedAt: time.Now(), describeRetries: &describeRetries, @@ -303,12 +303,6 @@ func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, nam err := execPBMCommand(ctx, dsn, &info, "describe-backup", name) return info, err }, - isRunning: func(status *pbmStatus) bool { - return isPBMBackupRunning(status, name) - }, - findSnapshot: func(status *pbmStatus) *pbmSnapshot { - return findPBMSnapshot(status, name) - }, }) } @@ -349,7 +343,7 @@ func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, return false, errors.Wrap(statusErr, "failed to get pbm status") } - if cfg.isRunning(status) { + if cfg.operationIsRunning(status) { if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { cfg.l.Debugf("describe-%s transient error while %s is running: %s", cfg.operation, cfg.operation, describeErr) return false, nil @@ -359,12 +353,10 @@ func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, } } - if cfg.findSnapshot != nil { - if snapshot := cfg.findSnapshot(status); snapshot != nil { - done, err := snapshotTerminalError(snapshot, cfg.operation) - if done { - return true, err - } + if snapshot := cfg.snapshotForTarget(status); snapshot != nil { + done, err := snapshotTerminalError(snapshot, cfg.operation) + if done { + return true, err } } @@ -404,6 +396,30 @@ func isTransientPBMDescribeError(err error) bool { strings.Contains(msg, "missed file") } +func (cfg pbmDescribePollConfig) operationIsRunning(status *pbmStatus) bool { + if cfg.isRunning != nil { + return cfg.isRunning(status) + } + switch cfg.operation { + case pbmCmdBackup: + return isPBMBackupRunning(status, cfg.targetName) + case pbmCmdRestore: + return isPBMRestoreRunning(status) + default: + return false + } +} + +func (cfg pbmDescribePollConfig) snapshotForTarget(status *pbmStatus) *pbmSnapshot { + if cfg.findSnapshot != nil { + return cfg.findSnapshot(status) + } + if cfg.operation == pbmCmdBackup { + return findPBMSnapshot(status, cfg.targetName) + } + return nil +} + func isPBMBackupRunning(status *pbmStatus, name string) bool { return status.Running.Type == pbmCmdBackup && status.Running.Name == name } @@ -453,7 +469,8 @@ func snapshotTerminalError(snapshot *pbmSnapshot, operation string) (bool, error } func describeFailureError(info describeInfo, operation string) error { - if err := groupDescribeErrors(info); err != nil && err.Error() != "operation failed" { + err := groupDescribeErrors(info) + if err != nil && err.Error() != "operation failed" { return err } return errors.Errorf("%s failed", operation) @@ -536,7 +553,7 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dsn string, re return waitForPBMDescribe(ctx, pbmDescribePollConfig{ l: l, dsn: dsn, - operation: "restore", + operation: pbmCmdRestore, targetName: name, startedAt: time.Now(), describeRetries: &describeRetries, @@ -550,9 +567,6 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dsn string, re } return info, describeErr }, - isRunning: func(status *pbmStatus) bool { - return isPBMRestoreRunning(status) - }, }) } diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index 06bceeb5e38..54acfb360ee 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -16,11 +16,11 @@ package jobs import ( "context" + "errors" "os" "testing" "time" - "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -43,7 +43,7 @@ func newTestDescribePollConfig(t *testing.T, opts ...func(*pbmDescribePollConfig cfg := pbmDescribePollConfig{ l: logrus.New(), dsn: "mongodb://localhost", - operation: "backup", + operation: pbmCmdBackup, targetName: "2024-01-01T00:00:00Z", startedAt: time.Now(), describeRetries: &retries, @@ -207,6 +207,43 @@ func TestIsTransientPBMDescribeError(t *testing.T) { } } +func TestOperationIsRunning(t *testing.T) { + t.Parallel() + + backupCfg := pbmDescribePollConfig{ + operation: pbmCmdBackup, + targetName: "backup-1", + } + status := &pbmStatus{} + status.Running.Type = pbmCmdBackup + status.Running.Name = "backup-1" + assert.True(t, backupCfg.operationIsRunning(status)) + + restoreCfg := pbmDescribePollConfig{operation: pbmCmdRestore} + status.Running.Type = pbmCmdRestore + assert.True(t, restoreCfg.operationIsRunning(status)) + + customCfg := pbmDescribePollConfig{ + isRunning: func(*pbmStatus) bool { return true }, + } + assert.True(t, customCfg.operationIsRunning(status)) +} + +func TestSnapshotForTarget(t *testing.T) { + t.Parallel() + + cfg := pbmDescribePollConfig{ + operation: pbmCmdBackup, + targetName: "snap-1", + } + status := &pbmStatus{} + status.Backups.Snapshot = []pbmSnapshot{{Name: "snap-1"}} + assert.NotNil(t, cfg.snapshotForTarget(status)) + + cfg.operation = pbmCmdRestore + assert.Nil(t, cfg.snapshotForTarget(status)) +} + func TestIsPBMBackupRunning(t *testing.T) { t.Parallel() @@ -243,7 +280,7 @@ func TestRetryDescribeCommand(t *testing.T) { retries := 1 cfg := pbmDescribePollConfig{ l: logrus.New(), - operation: "backup", + operation: pbmCmdBackup, describeRetries: &retries, } @@ -255,10 +292,10 @@ func TestRetryDescribeCommand(t *testing.T) { func TestDescribeFailureError(t *testing.T) { t.Parallel() - err := describeFailureError(describeInfo{Status: pbmStatusError}, "backup") + err := describeFailureError(describeInfo{Status: pbmStatusError}, pbmCmdBackup) require.EqualError(t, err, "backup failed") - err = describeFailureError(describeInfo{Status: pbmStatusError, Error: "oplog gap"}, "backup") + err = describeFailureError(describeInfo{Status: pbmStatusError, Error: "oplog gap"}, pbmCmdBackup) require.EqualError(t, err, "oplog gap") } @@ -616,23 +653,23 @@ func TestWritePBMConfigFile(t *testing.T) { func TestDescribeTerminalError(t *testing.T) { t.Parallel() - done, err := describeTerminalError(describeInfo{Status: pbmStatusDone}, "backup") + done, err := describeTerminalError(describeInfo{Status: pbmStatusDone}, pbmCmdBackup) require.NoError(t, err) assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: pbmStatusCanceled}, "backup") + done, err = describeTerminalError(describeInfo{Status: pbmStatusCanceled}, pbmCmdBackup) require.EqualError(t, err, "backup was canceled") assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: pbmStatusError, Error: "oplog has insufficient range"}, "backup") + done, err = describeTerminalError(describeInfo{Status: pbmStatusError, Error: "oplog has insufficient range"}, pbmCmdBackup) require.EqualError(t, err, "oplog has insufficient range") assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: pbmStatusPartlyDone, Error: "partial"}, "backup") + done, err = describeTerminalError(describeInfo{Status: pbmStatusPartlyDone, Error: "partial"}, pbmCmdBackup) require.EqualError(t, err, "partial") assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: "running"}, "backup") + done, err = describeTerminalError(describeInfo{Status: "running"}, pbmCmdBackup) require.NoError(t, err) assert.False(t, done) } @@ -640,23 +677,23 @@ func TestDescribeTerminalError(t *testing.T) { func TestSnapshotTerminalError(t *testing.T) { t.Parallel() - done, err := snapshotTerminalError(&pbmSnapshot{Status: pbmStatusDone}, "backup") + done, err := snapshotTerminalError(&pbmSnapshot{Status: pbmStatusDone}, pbmCmdBackup) require.NoError(t, err) assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusCanceled}, "backup") + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusCanceled}, pbmCmdBackup) require.EqualError(t, err, "backup was canceled") assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError, Error: "storage unavailable"}, "backup") + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError, Error: "storage unavailable"}, pbmCmdBackup) require.EqualError(t, err, "storage unavailable") assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError}, "backup") + done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError}, pbmCmdBackup) require.EqualError(t, err, "backup failed") assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: "running"}, "backup") + done, err = snapshotTerminalError(&pbmSnapshot{Status: "running"}, pbmCmdBackup) require.NoError(t, err) assert.False(t, done) } From dbc73939b8e3e8a5b4103bcb0f1ebab642f704f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 15 Jun 2026 12:02:14 +0200 Subject: [PATCH 04/16] PMM-14576 Improvements. --- agent/runner/jobs/pbm_helpers.go | 7 +++---- agent/runner/jobs/pbm_helpers_test.go | 20 +++++++++++++++++--- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index 82bd4dd3ecb..a6985903ed2 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -390,7 +390,6 @@ func isTransientPBMDescribeError(err error) bool { msg := strings.ToLower(err.Error()) return strings.Contains(msg, "no such file") || strings.Contains(msg, "file is empty") || - strings.Contains(msg, "not found") || strings.Contains(msg, "get backup meta") || strings.Contains(msg, "get snapshot size") || strings.Contains(msg, "missed file") @@ -404,7 +403,7 @@ func (cfg pbmDescribePollConfig) operationIsRunning(status *pbmStatus) bool { case pbmCmdBackup: return isPBMBackupRunning(status, cfg.targetName) case pbmCmdRestore: - return isPBMRestoreRunning(status) + return isPBMRestoreRunning(status, cfg.targetName) default: return false } @@ -424,8 +423,8 @@ func isPBMBackupRunning(status *pbmStatus, name string) bool { return status.Running.Type == pbmCmdBackup && status.Running.Name == name } -func isPBMRestoreRunning(status *pbmStatus) bool { - return status.Running.Type == pbmCmdRestore +func isPBMRestoreRunning(status *pbmStatus, name string) bool { + return status.Running.Type == pbmCmdRestore && status.Running.Name == name } func findPBMSnapshot(status *pbmStatus, name string) *pbmSnapshot { diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index 54acfb360ee..ff09c8cac19 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -194,6 +194,11 @@ func TestIsTransientPBMDescribeError(t *testing.T) { err: errors.New("get snapshot size: missed file"), want: true, }, + { + name: "generic not found", + err: errors.New("authentication failed: user not found"), + want: false, + }, { name: "real failure", err: errors.New("permission denied"), @@ -219,9 +224,15 @@ func TestOperationIsRunning(t *testing.T) { status.Running.Name = "backup-1" assert.True(t, backupCfg.operationIsRunning(status)) - restoreCfg := pbmDescribePollConfig{operation: pbmCmdRestore} + restoreCfg := pbmDescribePollConfig{ + operation: pbmCmdRestore, + targetName: "restore-1", + } status.Running.Type = pbmCmdRestore + status.Running.Name = "restore-1" assert.True(t, restoreCfg.operationIsRunning(status)) + status.Running.Name = "restore-2" + assert.False(t, restoreCfg.operationIsRunning(status)) customCfg := pbmDescribePollConfig{ isRunning: func(*pbmStatus) bool { return true }, @@ -261,8 +272,11 @@ func TestIsPBMRestoreRunning(t *testing.T) { status := &pbmStatus{} status.Running.Type = pbmCmdRestore - assert.True(t, isPBMRestoreRunning(status)) - assert.False(t, isPBMRestoreRunning(&pbmStatus{})) + status.Running.Name = "restore-1" + + assert.True(t, isPBMRestoreRunning(status, "restore-1")) + assert.False(t, isPBMRestoreRunning(status, "restore-2")) + assert.False(t, isPBMRestoreRunning(&pbmStatus{}, "restore-1")) } func TestShouldRetryDescribeFailure(t *testing.T) { From c279451fbc5748ba148413b7bf86d7adc67f0090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 15 Jun 2026 12:53:22 +0200 Subject: [PATCH 05/16] PMM-14576 Refactor. --- agent/runner/jobs/pbm_helpers.go | 34 +++++++++--------- agent/runner/jobs/pbm_helpers_test.go | 52 ++++----------------------- 2 files changed, 22 insertions(+), 64 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index a6985903ed2..ba5e6bd9915 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -49,6 +49,8 @@ const ( pbmStatusPartlyDone = "partlyDone" ) +var errPBMOperationFailed = errors.New("operation failed") + type pbmSeverity int type describeInfo struct { @@ -281,12 +283,9 @@ type pbmDescribePollConfig struct { findSnapshot func(*pbmStatus) *pbmSnapshot } -// pbmDescribePollInterval is used by waitForPBMDescribe; tests may override it. +// pbmDescribePollInterval is used by waitForPBMDescribe. var pbmDescribePollInterval = statusCheckInterval -// pbmStatusFetcher fetches PBM status; tests may override it. -var pbmStatusFetcher = getPBMStatus - func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, name string) error { l.Infof("waiting for pbm backup: %s", name) @@ -311,15 +310,16 @@ func waitForPBMDescribe(ctx context.Context, cfg pbmDescribePollConfig) error { defer ticker.Stop() for { + done, err := pollPBMDescribeOnce(ctx, cfg) + if err != nil { + return err + } + if done { + return nil + } + select { case <-ticker.C: - done, err := pollPBMDescribeOnce(ctx, cfg) - if err != nil { - return err - } - if done { - return nil - } case <-ctx.Done(): return ctx.Err() } @@ -335,7 +335,7 @@ func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, fetchStatus := cfg.fetchStatus if fetchStatus == nil { - fetchStatus = pbmStatusFetcher + fetchStatus = getPBMStatus } status, statusErr := fetchStatus(ctx, cfg.dsn) @@ -351,6 +351,8 @@ func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, if retryDescribeCommand(cfg, describeErr) { return false, nil } + // PBM reports the operation is still running; keep polling. + return false, nil } if snapshot := cfg.snapshotForTarget(status); snapshot != nil { @@ -469,7 +471,7 @@ func snapshotTerminalError(snapshot *pbmSnapshot, operation string) (bool, error func describeFailureError(info describeInfo, operation string) error { err := groupDescribeErrors(info) - if err != nil && err.Error() != "operation failed" { + if err != nil && !errors.Is(err, errPBMOperationFailed) { return err } return errors.Errorf("%s failed", operation) @@ -720,15 +722,11 @@ func groupDescribeErrors(info describeInfo) error { } if len(errMsgs) == 0 { - return errors.New("operation failed") + return errPBMOperationFailed } return errors.New(strings.Join(errMsgs, "; ")) } -func groupPartlyDoneErrors(info describeInfo) error { - return groupDescribeErrors(info) -} - // pbmGetSnapshotTimestamp returns time the backup restores target db to. func pbmGetSnapshotTimestamp(ctx context.Context, l logrus.FieldLogger, dsn string, backupName string) (*time.Time, error) { snapshots, err := getSnapshots(ctx, l, dsn) diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index ff09c8cac19..fb7c02fb342 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -238,6 +238,9 @@ func TestOperationIsRunning(t *testing.T) { isRunning: func(*pbmStatus) bool { return true }, } assert.True(t, customCfg.operationIsRunning(status)) + + unknownCfg := pbmDescribePollConfig{operation: "unknown"} + assert.False(t, unknownCfg.operationIsRunning(status)) } func TestSnapshotForTarget(t *testing.T) { @@ -317,7 +320,7 @@ func TestGroupDescribeErrors_AllBranches(t *testing.T) { t.Parallel() err := groupDescribeErrors(describeInfo{}) - require.EqualError(t, err, "operation failed") + require.ErrorIs(t, err, errPBMOperationFailed) err = groupDescribeErrors(describeInfo{Error: "top level"}) require.EqualError(t, err, "top level") @@ -328,24 +331,6 @@ func TestGroupDescribeErrors_AllBranches(t *testing.T) { require.EqualError(t, err, "replset: rs0, error: rs failed") } -func TestGroupPartlyDoneErrors(t *testing.T) { - t.Parallel() - - err := groupPartlyDoneErrors(describeInfo{ - Status: pbmStatusPartlyDone, - ReplSets: []replSet{{ - Name: "rs0", - Status: pbmStatusPartlyDone, - Nodes: []node{{ - Name: "node1", - Status: pbmStatusError, - Error: "copy failed", - }}, - }}, - }) - require.EqualError(t, err, "replset: rs0, node: node1, error: copy failed") -} - func TestPollPBMDescribeOnce(t *testing.T) { t.Parallel() @@ -528,31 +513,6 @@ func TestPollPBMDescribeOnce(t *testing.T) { assert.False(t, done) }) - t.Run("uses default status fetcher", func(t *testing.T) { - t.Parallel() - - oldFetcher := pbmStatusFetcher - pbmStatusFetcher = func(context.Context, string) (*pbmStatus, error) { - return &pbmStatus{}, nil - } - t.Cleanup(func() { - pbmStatusFetcher = oldFetcher - }) - - retries := 0 - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = &retries - c.fetchStatus = nil - c.fetchDescribe = func(context.Context) (describeInfo, error) { - return describeInfo{}, errors.New("permission denied") - } - }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) - require.ErrorContains(t, err, "failed to get backup status") - assert.False(t, done) - }) - t.Run("retries after startup grace when retries remain", func(t *testing.T) { t.Parallel() retries := 2 @@ -572,7 +532,7 @@ func TestPollPBMDescribeOnce(t *testing.T) { assert.Equal(t, 1, retries) }) - t.Run("running backup with exhausted retries returns error", func(t *testing.T) { + t.Run("running backup with exhausted retries keeps waiting", func(t *testing.T) { t.Parallel() retries := 0 cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { @@ -592,7 +552,7 @@ func TestPollPBMDescribeOnce(t *testing.T) { } }) done, err := pollPBMDescribeOnce(context.Background(), cfg) - require.ErrorContains(t, err, "failed to get backup status") + require.NoError(t, err) assert.False(t, done) }) From e44ead283ff5857b94b918c8e297ebf2535c116e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 15 Jun 2026 16:54:24 +0200 Subject: [PATCH 06/16] PMM-14576 Refactor. --- agent/runner/jobs/pbm_helpers.go | 243 ++++++++++++++------------ agent/runner/jobs/pbm_helpers_test.go | 179 ++++++++++--------- 2 files changed, 234 insertions(+), 188 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index ba5e6bd9915..81107a1b852 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -37,7 +37,7 @@ const ( maxRestoreChecks = 100 maxDescribeCommandRetries = 5 - // PBM waits up to ~33s for backup metadata; allow extra margin for describe-backup. + // PBM waits up to ~33s for backup metadata. Allow extra margin for describe-backup. pbmDescribeStartupGrace = 60 * time.Second pbmCmdBackup = "backup" @@ -271,42 +271,51 @@ func getPBMStatus(ctx context.Context, dsn string) (*pbmStatus, error) { } type pbmDescribePollConfig struct { - l logrus.FieldLogger - dsn string - operation string - targetName string - startedAt time.Time - describeRetries *int - fetchDescribe func(context.Context) (describeInfo, error) - fetchStatus func(context.Context, string) (*pbmStatus, error) - isRunning func(*pbmStatus) bool - findSnapshot func(*pbmStatus) *pbmSnapshot + l logrus.FieldLogger + dsn string + operation string + targetName string + startedAt time.Time + pollInterval time.Duration + describeRetries int + fetchDescribe func(context.Context) (describeInfo, error) + fetchStatus func(context.Context, string) (*pbmStatus, error) + fetchRestoreList func(context.Context) ([]pbmListRestore, error) + isRunning func(*pbmStatus) bool + findSnapshot func(*pbmStatus) *pbmSnapshot } -// pbmDescribePollInterval is used by waitForPBMDescribe. -var pbmDescribePollInterval = statusCheckInterval - -func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, name string) error { - l.Infof("waiting for pbm backup: %s", name) +func (cfg *pbmDescribePollConfig) describePollInterval() time.Duration { + if cfg.pollInterval > 0 { + return cfg.pollInterval + } + return statusCheckInterval +} - describeRetries := maxDescribeCommandRetries - return waitForPBMDescribe(ctx, pbmDescribePollConfig{ +func newDescribePollConfig(l logrus.FieldLogger, dsn, operation, targetName string, fetchDescribe func(context.Context) (describeInfo, error)) *pbmDescribePollConfig { + return &pbmDescribePollConfig{ l: l, dsn: dsn, - operation: pbmCmdBackup, - targetName: name, + operation: operation, + targetName: targetName, startedAt: time.Now(), - describeRetries: &describeRetries, - fetchDescribe: func(ctx context.Context) (describeInfo, error) { - var info describeInfo - err := execPBMCommand(ctx, dsn, &info, "describe-backup", name) - return info, err - }, - }) + describeRetries: maxDescribeCommandRetries, + fetchDescribe: fetchDescribe, + } +} + +func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, name string) error { + l.Infof("waiting for pbm backup: %s", name) + + return waitForPBMDescribe(ctx, newDescribePollConfig(l, dsn, pbmCmdBackup, name, func(ctx context.Context) (describeInfo, error) { + var info describeInfo + err := execPBMCommand(ctx, dsn, &info, "describe-backup", name) + return info, err + })) } -func waitForPBMDescribe(ctx context.Context, cfg pbmDescribePollConfig) error { - ticker := time.NewTicker(pbmDescribePollInterval) +func waitForPBMDescribe(ctx context.Context, cfg *pbmDescribePollConfig) error { + ticker := time.NewTicker(cfg.describePollInterval()) defer ticker.Stop() for { @@ -326,65 +335,95 @@ func waitForPBMDescribe(ctx context.Context, cfg pbmDescribePollConfig) error { } } -func pollPBMDescribeOnce(ctx context.Context, cfg pbmDescribePollConfig) (bool, error) { +func pollPBMDescribeOnce(ctx context.Context, cfg *pbmDescribePollConfig) (bool, error) { info, describeErr := cfg.fetchDescribe(ctx) if describeErr == nil { - *cfg.describeRetries = maxDescribeCommandRetries + cfg.describeRetries = maxDescribeCommandRetries return describeTerminalError(info, cfg.operation) } - fetchStatus := cfg.fetchStatus - if fetchStatus == nil { - fetchStatus = getPBMStatus - } - - status, statusErr := fetchStatus(ctx, cfg.dsn) + status, statusErr := cfg.getPBMStatus(ctx) if statusErr != nil { return false, errors.Wrap(statusErr, "failed to get pbm status") } if cfg.operationIsRunning(status) { - if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { - cfg.l.Debugf("describe-%s transient error while %s is running: %s", cfg.operation, cfg.operation, describeErr) - return false, nil - } - if retryDescribeCommand(cfg, describeErr) { - return false, nil - } - // PBM reports the operation is still running; keep polling. + cfg.logAndRetryDescribeFailure(describeErr, true) return false, nil } - if snapshot := cfg.snapshotForTarget(status); snapshot != nil { - done, err := snapshotTerminalError(snapshot, cfg.operation) - if done { - return true, err - } + if done, err := cfg.statusFallback(ctx, status); done { + return true, err } - if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { - return false, nil - } - if retryDescribeCommand(cfg, describeErr) { + if cfg.logAndRetryDescribeFailure(describeErr, false) { return false, nil } return false, errors.Wrapf(describeErr, "failed to get %s status", cfg.operation) } -func shouldRetryDescribeFailure(err error, startedAt time.Time) bool { - return isTransientPBMDescribeError(err) && time.Since(startedAt) < pbmDescribeStartupGrace +func (cfg *pbmDescribePollConfig) getPBMStatus(ctx context.Context) (*pbmStatus, error) { + if cfg.fetchStatus != nil { + return cfg.fetchStatus(ctx, cfg.dsn) + } + return getPBMStatus(ctx, cfg.dsn) +} + +func (cfg *pbmDescribePollConfig) logAndRetryDescribeFailure(describeErr error, running bool) bool { + if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { + if running { + cfg.l.Debugf("describe-%s transient error while %s is running: %s", cfg.operation, cfg.operation, describeErr) + } + return true + } + return cfg.retryDescribeCommand(describeErr) } -func retryDescribeCommand(cfg pbmDescribePollConfig, err error) bool { - if *cfg.describeRetries <= 0 { +func (cfg *pbmDescribePollConfig) retryDescribeCommand(err error) bool { + if cfg.describeRetries <= 0 { return false } - *cfg.describeRetries-- + cfg.describeRetries-- cfg.l.Warnf("describe-%s failed and will retry: %s", cfg.operation, err) return true } +func (cfg *pbmDescribePollConfig) statusFallback(ctx context.Context, status *pbmStatus) (bool, error) { + if snapshot := cfg.snapshotForTarget(status); snapshot != nil { + return terminalStatusError(snapshot.Status, snapshot.Error, cfg.operation) + } + + if cfg.operation != pbmCmdRestore { + return false, nil + } + + list, err := cfg.fetchRestoreListOrDefault(ctx) + if err != nil { + cfg.l.Debugf("failed to get restore list for fallback: %s", err) + return false, nil + } + if restore := findPBMListRestore(list, cfg.targetName); restore != nil { + return terminalStatusError(restore.Status, restore.Error, cfg.operation) + } + return false, nil +} + +func (cfg *pbmDescribePollConfig) fetchRestoreListOrDefault(ctx context.Context) ([]pbmListRestore, error) { + if cfg.fetchRestoreList != nil { + return cfg.fetchRestoreList(ctx) + } + var list []pbmListRestore + err := execPBMCommand(ctx, cfg.dsn, &list, "list", "--restore") + return list, err +} + +func shouldRetryDescribeFailure(err error, startedAt time.Time) bool { + return isTransientPBMDescribeError(err) && time.Since(startedAt) < pbmDescribeStartupGrace +} + +// isTransientPBMDescribeError reports whether describe-backup/restore may fail +// temporarily while PBM metadata is not ready yet. func isTransientPBMDescribeError(err error) bool { if err == nil { return false @@ -397,21 +436,17 @@ func isTransientPBMDescribeError(err error) bool { strings.Contains(msg, "missed file") } -func (cfg pbmDescribePollConfig) operationIsRunning(status *pbmStatus) bool { +func (cfg *pbmDescribePollConfig) operationIsRunning(status *pbmStatus) bool { if cfg.isRunning != nil { return cfg.isRunning(status) } - switch cfg.operation { - case pbmCmdBackup: - return isPBMBackupRunning(status, cfg.targetName) - case pbmCmdRestore: - return isPBMRestoreRunning(status, cfg.targetName) - default: + if cfg.operation != pbmCmdBackup && cfg.operation != pbmCmdRestore { return false } + return status.Running.Type == cfg.operation && status.Running.Name == cfg.targetName } -func (cfg pbmDescribePollConfig) snapshotForTarget(status *pbmStatus) *pbmSnapshot { +func (cfg *pbmDescribePollConfig) snapshotForTarget(status *pbmStatus) *pbmSnapshot { if cfg.findSnapshot != nil { return cfg.findSnapshot(status) } @@ -421,49 +456,53 @@ func (cfg pbmDescribePollConfig) snapshotForTarget(status *pbmStatus) *pbmSnapsh return nil } -func isPBMBackupRunning(status *pbmStatus, name string) bool { - return status.Running.Type == pbmCmdBackup && status.Running.Name == name -} - -func isPBMRestoreRunning(status *pbmStatus, name string) bool { - return status.Running.Type == pbmCmdRestore && status.Running.Name == name +func findPBMSnapshot(status *pbmStatus, name string) *pbmSnapshot { + i := slices.IndexFunc(status.Backups.Snapshot, func(s pbmSnapshot) bool { + return s.Name == name + }) + if i < 0 { + return nil + } + return &status.Backups.Snapshot[i] } -func findPBMSnapshot(status *pbmStatus, name string) *pbmSnapshot { - for i := range status.Backups.Snapshot { - if status.Backups.Snapshot[i].Name == name { - return &status.Backups.Snapshot[i] - } +func findPBMListRestore(list []pbmListRestore, name string) *pbmListRestore { + i := slices.IndexFunc(list, func(r pbmListRestore) bool { + return r.Name == name + }) + if i < 0 { + return nil } - return nil + return &list[i] } func describeTerminalError(info describeInfo, operation string) (bool, error) { switch info.Status { - case pbmStatusDone: - return true, nil - case pbmStatusCanceled: - return true, errors.Errorf("%s was canceled", operation) case pbmStatusError: return true, describeFailureError(info, operation) case pbmStatusPartlyDone: return true, groupDescribeErrors(info) default: - return false, nil + return terminalStatusError(info.Status, info.Error, operation) } } -func snapshotTerminalError(snapshot *pbmSnapshot, operation string) (bool, error) { - switch snapshot.Status { +func terminalStatusError(status, errMsg, operation string) (bool, error) { + switch status { case pbmStatusDone: return true, nil case pbmStatusCanceled: return true, errors.Errorf("%s was canceled", operation) case pbmStatusError: - if snapshot.Error != "" { - return true, errors.New(snapshot.Error) + if errMsg != "" { + return true, errors.New(errMsg) } return true, errors.Errorf("%s failed", operation) + case pbmStatusPartlyDone: + if errMsg != "" { + return true, errors.New(errMsg) + } + return true, errors.Errorf("%s partly completed", operation) default: return false, nil } @@ -550,25 +589,15 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dsn string, re l.Infof("waiting for pbm restore: %s", name) - describeRetries := maxDescribeCommandRetries - return waitForPBMDescribe(ctx, pbmDescribePollConfig{ - l: l, - dsn: dsn, - operation: pbmCmdRestore, - targetName: name, - startedAt: time.Now(), - describeRetries: &describeRetries, - fetchDescribe: func(ctx context.Context) (describeInfo, error) { - var info describeInfo - var describeErr error - if backupType == "physical" { - describeErr = execPBMCommand(ctx, dsn, &info, "describe-restore", name, "--config="+confFile) - } else { - describeErr = execPBMCommand(ctx, dsn, &info, "describe-restore", name) - } - return info, describeErr - }, - }) + return waitForPBMDescribe(ctx, newDescribePollConfig(l, dsn, pbmCmdRestore, name, func(ctx context.Context) (describeInfo, error) { + var info describeInfo + if backupType == "physical" { + err := execPBMCommand(ctx, dsn, &info, "describe-restore", name, "--config="+confFile) + return info, err + } + err := execPBMCommand(ctx, dsn, &info, "describe-restore", name) + return info, err + })) } func pbmConfigure(ctx context.Context, l logrus.FieldLogger, params pbmConfigParams) error { diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index fb7c02fb342..3221c3ace9d 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -26,36 +26,22 @@ import ( "github.com/stretchr/testify/require" ) -func withFastPBMDescribePoll(t *testing.T) { +func newTestDescribePollConfig(t *testing.T, opts ...func(*pbmDescribePollConfig)) *pbmDescribePollConfig { t.Helper() - old := pbmDescribePollInterval - pbmDescribePollInterval = time.Millisecond - t.Cleanup(func() { - pbmDescribePollInterval = old - }) -} - -func newTestDescribePollConfig(t *testing.T, opts ...func(*pbmDescribePollConfig)) pbmDescribePollConfig { - t.Helper() - - retries := maxDescribeCommandRetries - cfg := pbmDescribePollConfig{ + cfg := &pbmDescribePollConfig{ l: logrus.New(), dsn: "mongodb://localhost", operation: pbmCmdBackup, targetName: "2024-01-01T00:00:00Z", startedAt: time.Now(), - describeRetries: &retries, + describeRetries: maxDescribeCommandRetries, fetchDescribe: func(context.Context) (describeInfo, error) { return describeInfo{Status: pbmStatusDone}, nil }, - isRunning: func(*pbmStatus) bool { - return false - }, } for _, opt := range opts { - opt(&cfg) + opt(cfg) } return cfg } @@ -215,7 +201,7 @@ func TestIsTransientPBMDescribeError(t *testing.T) { func TestOperationIsRunning(t *testing.T) { t.Parallel() - backupCfg := pbmDescribePollConfig{ + backupCfg := &pbmDescribePollConfig{ operation: pbmCmdBackup, targetName: "backup-1", } @@ -224,7 +210,7 @@ func TestOperationIsRunning(t *testing.T) { status.Running.Name = "backup-1" assert.True(t, backupCfg.operationIsRunning(status)) - restoreCfg := pbmDescribePollConfig{ + restoreCfg := &pbmDescribePollConfig{ operation: pbmCmdRestore, targetName: "restore-1", } @@ -234,19 +220,19 @@ func TestOperationIsRunning(t *testing.T) { status.Running.Name = "restore-2" assert.False(t, restoreCfg.operationIsRunning(status)) - customCfg := pbmDescribePollConfig{ + customCfg := &pbmDescribePollConfig{ isRunning: func(*pbmStatus) bool { return true }, } assert.True(t, customCfg.operationIsRunning(status)) - unknownCfg := pbmDescribePollConfig{operation: "unknown"} + unknownCfg := &pbmDescribePollConfig{operation: "unknown"} assert.False(t, unknownCfg.operationIsRunning(status)) } func TestSnapshotForTarget(t *testing.T) { t.Parallel() - cfg := pbmDescribePollConfig{ + cfg := &pbmDescribePollConfig{ operation: pbmCmdBackup, targetName: "snap-1", } @@ -258,30 +244,6 @@ func TestSnapshotForTarget(t *testing.T) { assert.Nil(t, cfg.snapshotForTarget(status)) } -func TestIsPBMBackupRunning(t *testing.T) { - t.Parallel() - - status := &pbmStatus{} - status.Running.Type = pbmCmdBackup - status.Running.Name = "backup-1" - - assert.True(t, isPBMBackupRunning(status, "backup-1")) - assert.False(t, isPBMBackupRunning(status, "backup-2")) - assert.False(t, isPBMBackupRunning(&pbmStatus{}, "backup-1")) -} - -func TestIsPBMRestoreRunning(t *testing.T) { - t.Parallel() - - status := &pbmStatus{} - status.Running.Type = pbmCmdRestore - status.Running.Name = "restore-1" - - assert.True(t, isPBMRestoreRunning(status, "restore-1")) - assert.False(t, isPBMRestoreRunning(status, "restore-2")) - assert.False(t, isPBMRestoreRunning(&pbmStatus{}, "restore-1")) -} - func TestShouldRetryDescribeFailure(t *testing.T) { t.Parallel() @@ -294,16 +256,15 @@ func TestShouldRetryDescribeFailure(t *testing.T) { func TestRetryDescribeCommand(t *testing.T) { t.Parallel() - retries := 1 - cfg := pbmDescribePollConfig{ + cfg := &pbmDescribePollConfig{ l: logrus.New(), operation: pbmCmdBackup, - describeRetries: &retries, + describeRetries: 1, } - assert.True(t, retryDescribeCommand(cfg, errors.New("temporary"))) - assert.Equal(t, 0, retries) - assert.False(t, retryDescribeCommand(cfg, errors.New("temporary"))) + assert.True(t, cfg.retryDescribeCommand(errors.New("temporary"))) + assert.Equal(t, 0, cfg.describeRetries) + assert.False(t, cfg.retryDescribeCommand(errors.New("temporary"))) } func TestDescribeFailureError(t *testing.T) { @@ -415,9 +376,6 @@ func TestPollPBMDescribeOnce(t *testing.T) { status.Running.Name = c.targetName return status, nil } - c.isRunning = func(status *pbmStatus) bool { - return isPBMBackupRunning(status, c.targetName) - } }) done, err := pollPBMDescribeOnce(context.Background(), cfg) require.NoError(t, err) @@ -426,10 +384,9 @@ func TestPollPBMDescribeOnce(t *testing.T) { t.Run("running backup retries describe command", func(t *testing.T) { t.Parallel() - retries := maxDescribeCommandRetries cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = &retries + c.describeRetries = maxDescribeCommandRetries c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } @@ -439,14 +396,11 @@ func TestPollPBMDescribeOnce(t *testing.T) { status.Running.Name = c.targetName return status, nil } - c.isRunning = func(status *pbmStatus) bool { - return isPBMBackupRunning(status, c.targetName) - } }) done, err := pollPBMDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) - assert.Equal(t, maxDescribeCommandRetries-1, retries) + assert.Equal(t, maxDescribeCommandRetries-1, cfg.describeRetries) }) t.Run("snapshot done when describe fails", func(t *testing.T) { @@ -498,6 +452,56 @@ func TestPollPBMDescribeOnce(t *testing.T) { assert.True(t, done) }) + t.Run("restore done when describe fails", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.operation = pbmCmdRestore + c.targetName = "2024-01-01T12:00:00Z" + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("no such file") + } + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + c.fetchRestoreList = func(context.Context) ([]pbmListRestore, error) { + return []pbmListRestore{{ + Name: c.targetName, + Status: pbmStatusDone, + }}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.True(t, done) + }) + + t.Run("restore terminal error when describe fails", func(t *testing.T) { + t.Parallel() + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.operation = pbmCmdRestore + c.targetName = "2024-01-01T12:00:00Z" + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("permission denied") + } + c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.describeRetries = 0 + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + c.fetchRestoreList = func(context.Context) ([]pbmListRestore, error) { + return []pbmListRestore{{ + Name: c.targetName, + Status: pbmStatusError, + Error: "node copy failed", + }}, nil + } + }) + done, err := pollPBMDescribeOnce(context.Background(), cfg) + require.EqualError(t, err, "node copy failed") + assert.True(t, done) + }) + t.Run("startup grace for transient error", func(t *testing.T) { t.Parallel() cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { @@ -515,10 +519,9 @@ func TestPollPBMDescribeOnce(t *testing.T) { t.Run("retries after startup grace when retries remain", func(t *testing.T) { t.Parallel() - retries := 2 cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = &retries + c.describeRetries = 2 c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("file is empty") } @@ -529,15 +532,14 @@ func TestPollPBMDescribeOnce(t *testing.T) { done, err := pollPBMDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) - assert.Equal(t, 1, retries) + assert.Equal(t, 1, cfg.describeRetries) }) t.Run("running backup with exhausted retries keeps waiting", func(t *testing.T) { t.Parallel() - retries := 0 cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = &retries + c.describeRetries = 0 c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } @@ -547,9 +549,6 @@ func TestPollPBMDescribeOnce(t *testing.T) { status.Running.Name = c.targetName return status, nil } - c.isRunning = func(status *pbmStatus) bool { - return isPBMBackupRunning(status, c.targetName) - } }) done, err := pollPBMDescribeOnce(context.Background(), cfg) require.NoError(t, err) @@ -558,10 +557,9 @@ func TestPollPBMDescribeOnce(t *testing.T) { t.Run("describe failure without running backup", func(t *testing.T) { t.Parallel() - retries := 0 cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = &retries + c.describeRetries = 0 c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } @@ -576,15 +574,17 @@ func TestPollPBMDescribeOnce(t *testing.T) { } func TestWaitForPBMDescribe(t *testing.T) { - withFastPBMDescribePoll(t) - t.Run("completes when describe reports done", func(t *testing.T) { - err := waitForPBMDescribe(context.Background(), newTestDescribePollConfig(t)) + cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.pollInterval = time.Millisecond + }) + err := waitForPBMDescribe(context.Background(), cfg) require.NoError(t, err) }) t.Run("returns describe error", func(t *testing.T) { cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.pollInterval = time.Millisecond c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{Status: pbmStatusCanceled}, nil } @@ -597,6 +597,7 @@ func TestWaitForPBMDescribe(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() err := waitForPBMDescribe(ctx, newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + c.pollInterval = time.Millisecond c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{Status: "running"}, nil } @@ -648,30 +649,46 @@ func TestDescribeTerminalError(t *testing.T) { assert.False(t, done) } -func TestSnapshotTerminalError(t *testing.T) { +func TestTerminalStatusError(t *testing.T) { t.Parallel() - done, err := snapshotTerminalError(&pbmSnapshot{Status: pbmStatusDone}, pbmCmdBackup) + done, err := terminalStatusError(pbmStatusDone, "", pbmCmdBackup) require.NoError(t, err) assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusCanceled}, pbmCmdBackup) + done, err = terminalStatusError(pbmStatusCanceled, "", pbmCmdBackup) require.EqualError(t, err, "backup was canceled") assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError, Error: "storage unavailable"}, pbmCmdBackup) + done, err = terminalStatusError(pbmStatusError, "storage unavailable", pbmCmdBackup) require.EqualError(t, err, "storage unavailable") assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: pbmStatusError}, pbmCmdBackup) + done, err = terminalStatusError(pbmStatusError, "", pbmCmdBackup) require.EqualError(t, err, "backup failed") assert.True(t, done) - done, err = snapshotTerminalError(&pbmSnapshot{Status: "running"}, pbmCmdBackup) + done, err = terminalStatusError(pbmStatusPartlyDone, "", pbmCmdRestore) + require.EqualError(t, err, "restore partly completed") + assert.True(t, done) + + done, err = terminalStatusError("running", "", pbmCmdBackup) require.NoError(t, err) assert.False(t, done) } +func TestFindPBMListRestore(t *testing.T) { + t.Parallel() + + list := []pbmListRestore{ + {Name: "2024-01-01T00:00:00Z", Status: pbmStatusDone}, + {Name: "2024-01-02T00:00:00Z", Status: pbmStatusError, Error: "failed"}, + } + + assert.Nil(t, findPBMListRestore(list, "missing")) + require.NotNil(t, findPBMListRestore(list, "2024-01-02T00:00:00Z")) +} + func TestFindPBMSnapshot(t *testing.T) { t.Parallel() From 80d4fcbd4519e48a3724a44b19db14546b68137f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Tue, 16 Jun 2026 10:18:20 +0200 Subject: [PATCH 07/16] PMM-14576 Refactor. --- agent/runner/jobs/pbm_helpers.go | 262 ++++++++++++------- agent/runner/jobs/pbm_helpers_test.go | 352 ++++++++++++++++---------- 2 files changed, 389 insertions(+), 225 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index 81107a1b852..f1c988af4a4 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -36,9 +36,12 @@ const ( statusCheckInterval = 5 * time.Second maxRestoreChecks = 100 - maxDescribeCommandRetries = 5 + maxDescribeRetries = 5 // PBM waits up to ~33s for backup metadata. Allow extra margin for describe-backup. - pbmDescribeStartupGrace = 60 * time.Second + describeStartupGrace = 60 * time.Second + // After the operation stops, status/list metadata can lag behind describe failures. + describeCompletionGrace = 5 * time.Minute + describeRunningWarnInterval = 5 * time.Minute pbmCmdBackup = "backup" pbmCmdRestore = "restore" @@ -270,56 +273,62 @@ func getPBMStatus(ctx context.Context, dsn string) (*pbmStatus, error) { return &status, nil } -type pbmDescribePollConfig struct { - l logrus.FieldLogger - dsn string - operation string - targetName string - startedAt time.Time - pollInterval time.Duration - describeRetries int - fetchDescribe func(context.Context) (describeInfo, error) - fetchStatus func(context.Context, string) (*pbmStatus, error) +type describePoller struct { + l logrus.FieldLogger + dsn string + operation string + name string + startedAt time.Time + finishedAt time.Time + lastRunningWarn time.Time + pollEvery time.Duration + retries int + fetchDescribe func(context.Context) (describeInfo, error) + fetchStatus func(context.Context, string) (*pbmStatus, error) fetchRestoreList func(context.Context) ([]pbmListRestore, error) - isRunning func(*pbmStatus) bool - findSnapshot func(*pbmStatus) *pbmSnapshot + isRunning func(*pbmStatus) bool + findSnapshot func(*pbmStatus) *pbmSnapshot } -func (cfg *pbmDescribePollConfig) describePollInterval() time.Duration { - if cfg.pollInterval > 0 { - return cfg.pollInterval +func (cfg *describePoller) interval() time.Duration { + if cfg.pollEvery > 0 { + return cfg.pollEvery } return statusCheckInterval } -func newDescribePollConfig(l logrus.FieldLogger, dsn, operation, targetName string, fetchDescribe func(context.Context) (describeInfo, error)) *pbmDescribePollConfig { - return &pbmDescribePollConfig{ - l: l, - dsn: dsn, - operation: operation, - targetName: targetName, - startedAt: time.Now(), - describeRetries: maxDescribeCommandRetries, - fetchDescribe: fetchDescribe, +func newDescribePoller(l logrus.FieldLogger, dsn, operation, name string, fetchDescribe func(context.Context) (describeInfo, error)) *describePoller { + return &describePoller{ + l: l, + dsn: dsn, + operation: operation, + name: name, + startedAt: time.Now(), + retries: maxDescribeRetries, + fetchDescribe: fetchDescribe, } } func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, name string) error { l.Infof("waiting for pbm backup: %s", name) - return waitForPBMDescribe(ctx, newDescribePollConfig(l, dsn, pbmCmdBackup, name, func(ctx context.Context) (describeInfo, error) { + return waitDescribe(ctx, newDescribePoller(l, dsn, pbmCmdBackup, name, func(ctx context.Context) (describeInfo, error) { var info describeInfo err := execPBMCommand(ctx, dsn, &info, "describe-backup", name) return info, err })) } -func waitForPBMDescribe(ctx context.Context, cfg *pbmDescribePollConfig) error { - ticker := time.NewTicker(cfg.describePollInterval()) +func waitDescribe(ctx context.Context, cfg *describePoller) error { + ticker := time.NewTicker(cfg.interval()) defer ticker.Stop() for { - done, err := pollPBMDescribeOnce(ctx, cfg) + if err := ctx.Err(); err != nil { + return err + } + + done, err := pollDescribeOnce(ctx, cfg) if err != nil { return err } @@ -335,11 +344,11 @@ func waitForPBMDescribe(ctx context.Context, cfg *pbmDescribePollConfig) error { } } -func pollPBMDescribeOnce(ctx context.Context, cfg *pbmDescribePollConfig) (bool, error) { +func pollDescribeOnce(ctx context.Context, cfg *describePoller) (bool, error) { info, describeErr := cfg.fetchDescribe(ctx) if describeErr == nil { - cfg.describeRetries = maxDescribeCommandRetries - return describeTerminalError(info, cfg.operation) + cfg.retries = maxDescribeRetries + return checkDescribe(info, cfg.operation) } status, statusErr := cfg.getPBMStatus(ctx) @@ -347,8 +356,12 @@ func pollPBMDescribeOnce(ctx context.Context, cfg *pbmDescribePollConfig) (bool, return false, errors.Wrap(statusErr, "failed to get pbm status") } - if cfg.operationIsRunning(status) { - cfg.logAndRetryDescribeFailure(describeErr, true) + running := cfg.opRunning(status) + cfg.trackFinished(running) + + if running { + cfg.warnRunningDescribe(describeErr) + cfg.retryDescribeErr(describeErr, true) return false, nil } @@ -356,60 +369,97 @@ func pollPBMDescribeOnce(ctx context.Context, cfg *pbmDescribePollConfig) (bool, return true, err } - if cfg.logAndRetryDescribeFailure(describeErr, false) { + if cfg.retryDescribeErr(describeErr, false) { return false, nil } return false, errors.Wrapf(describeErr, "failed to get %s status", cfg.operation) } -func (cfg *pbmDescribePollConfig) getPBMStatus(ctx context.Context) (*pbmStatus, error) { +func (cfg *describePoller) getPBMStatus(ctx context.Context) (*pbmStatus, error) { if cfg.fetchStatus != nil { return cfg.fetchStatus(ctx, cfg.dsn) } return getPBMStatus(ctx, cfg.dsn) } -func (cfg *pbmDescribePollConfig) logAndRetryDescribeFailure(describeErr error, running bool) bool { - if shouldRetryDescribeFailure(describeErr, cfg.startedAt) { +func (cfg *describePoller) trackFinished(running bool) { + if running { + cfg.finishedAt = time.Time{} + return + } + if cfg.finishedAt.IsZero() { + cfg.finishedAt = time.Now() + } +} + +func (cfg *describePoller) describeCmd() string { + return "describe-" + cfg.operation +} + +func (cfg *describePoller) warnRunningDescribe(describeErr error) { + if cfg.startedAt.IsZero() { + return + } + if time.Since(cfg.lastWarnAt()) < describeRunningWarnInterval { + return + } + cfg.lastRunningWarn = time.Now() + cfg.l.Warnf("%s %q is still running but %s keeps failing: %v", + cfg.operation, cfg.name, cfg.describeCmd(), describeErr) +} + +func (cfg *describePoller) lastWarnAt() time.Time { + if !cfg.lastRunningWarn.IsZero() { + return cfg.lastRunningWarn + } + return cfg.startedAt +} + +func (cfg *describePoller) retryDescribeErr(describeErr error, running bool) bool { + if retryTransient(describeErr, cfg, running) { if running { - cfg.l.Debugf("describe-%s transient error while %s is running: %s", cfg.operation, cfg.operation, describeErr) + cfg.l.Debugf("%s transient error while %s %q is still running: %v", + cfg.describeCmd(), cfg.operation, cfg.name, describeErr) + } else { + cfg.l.Debugf("%s transient error while waiting for %s %q completion metadata: %v", + cfg.describeCmd(), cfg.operation, cfg.name, describeErr) } return true } - return cfg.retryDescribeCommand(describeErr) + return cfg.retryDescribeCmd(describeErr) } -func (cfg *pbmDescribePollConfig) retryDescribeCommand(err error) bool { - if cfg.describeRetries <= 0 { +func (cfg *describePoller) retryDescribeCmd(err error) bool { + if cfg.retries <= 0 { return false } - cfg.describeRetries-- - cfg.l.Warnf("describe-%s failed and will retry: %s", cfg.operation, err) + cfg.retries-- + cfg.l.Warnf("%s failed and will retry: %s", cfg.describeCmd(), err) return true } -func (cfg *pbmDescribePollConfig) statusFallback(ctx context.Context, status *pbmStatus) (bool, error) { - if snapshot := cfg.snapshotForTarget(status); snapshot != nil { - return terminalStatusError(snapshot.Status, snapshot.Error, cfg.operation) +func (cfg *describePoller) statusFallback(ctx context.Context, status *pbmStatus) (bool, error) { + if snapshot := cfg.targetSnapshot(status); snapshot != nil { + return checkStatus(snapshot.Status, snapshot.Error, cfg.operation) } if cfg.operation != pbmCmdRestore { return false, nil } - list, err := cfg.fetchRestoreListOrDefault(ctx) + list, err := cfg.listRestores(ctx) if err != nil { cfg.l.Debugf("failed to get restore list for fallback: %s", err) return false, nil } - if restore := findPBMListRestore(list, cfg.targetName); restore != nil { - return terminalStatusError(restore.Status, restore.Error, cfg.operation) + if restore := restoreByName(list, cfg.name); restore != nil { + return checkStatus(restore.Status, restore.Error, cfg.operation) } return false, nil } -func (cfg *pbmDescribePollConfig) fetchRestoreListOrDefault(ctx context.Context) ([]pbmListRestore, error) { +func (cfg *describePoller) listRestores(ctx context.Context) ([]pbmListRestore, error) { if cfg.fetchRestoreList != nil { return cfg.fetchRestoreList(ctx) } @@ -418,45 +468,56 @@ func (cfg *pbmDescribePollConfig) fetchRestoreListOrDefault(ctx context.Context) return list, err } -func shouldRetryDescribeFailure(err error, startedAt time.Time) bool { - return isTransientPBMDescribeError(err) && time.Since(startedAt) < pbmDescribeStartupGrace +func retryTransient(err error, cfg *describePoller, running bool) bool { + if !isTransientDescribeErr(err) { + return false + } + if running { + return time.Since(cfg.startedAt) < describeStartupGrace + } + since := cfg.startedAt + if !cfg.finishedAt.IsZero() { + since = cfg.finishedAt + } + return time.Since(since) < describeCompletionGrace } -// isTransientPBMDescribeError reports whether describe-backup/restore may fail -// temporarily while PBM metadata is not ready yet. -func isTransientPBMDescribeError(err error) bool { +// isTransientDescribeErr reports whether describe-backup/restore may fail +// temporarily while PBM metadata is not ready yet. Matches are based on known +// PBM CLI error texts and should be updated when PBM changes them. +func isTransientDescribeErr(err error) bool { if err == nil { return false } msg := strings.ToLower(err.Error()) return strings.Contains(msg, "no such file") || strings.Contains(msg, "file is empty") || - strings.Contains(msg, "get backup meta") || - strings.Contains(msg, "get snapshot size") || - strings.Contains(msg, "missed file") + strings.Contains(msg, "missed file") || + (strings.Contains(msg, "get backup meta") && strings.Contains(msg, "not found")) || + strings.Contains(msg, "get snapshot size") } -func (cfg *pbmDescribePollConfig) operationIsRunning(status *pbmStatus) bool { +func (cfg *describePoller) opRunning(status *pbmStatus) bool { if cfg.isRunning != nil { return cfg.isRunning(status) } if cfg.operation != pbmCmdBackup && cfg.operation != pbmCmdRestore { return false } - return status.Running.Type == cfg.operation && status.Running.Name == cfg.targetName + return status.Running.Type == cfg.operation && status.Running.Name == cfg.name } -func (cfg *pbmDescribePollConfig) snapshotForTarget(status *pbmStatus) *pbmSnapshot { +func (cfg *describePoller) targetSnapshot(status *pbmStatus) *pbmSnapshot { if cfg.findSnapshot != nil { return cfg.findSnapshot(status) } if cfg.operation == pbmCmdBackup { - return findPBMSnapshot(status, cfg.targetName) + return snapshotByName(status, cfg.name) } return nil } -func findPBMSnapshot(status *pbmStatus, name string) *pbmSnapshot { +func snapshotByName(status *pbmStatus, name string) *pbmSnapshot { i := slices.IndexFunc(status.Backups.Snapshot, func(s pbmSnapshot) bool { return s.Name == name }) @@ -466,7 +527,7 @@ func findPBMSnapshot(status *pbmStatus, name string) *pbmSnapshot { return &status.Backups.Snapshot[i] } -func findPBMListRestore(list []pbmListRestore, name string) *pbmListRestore { +func restoreByName(list []pbmListRestore, name string) *pbmListRestore { i := slices.IndexFunc(list, func(r pbmListRestore) bool { return r.Name == name }) @@ -476,18 +537,18 @@ func findPBMListRestore(list []pbmListRestore, name string) *pbmListRestore { return &list[i] } -func describeTerminalError(info describeInfo, operation string) (bool, error) { +func checkDescribe(info describeInfo, operation string) (bool, error) { switch info.Status { case pbmStatusError: - return true, describeFailureError(info, operation) + return true, describeErr(info, operation) case pbmStatusPartlyDone: - return true, groupDescribeErrors(info) + return true, groupDescribeErrs(info) default: - return terminalStatusError(info.Status, info.Error, operation) + return checkStatus(info.Status, info.Error, operation) } } -func terminalStatusError(status, errMsg, operation string) (bool, error) { +func checkStatus(status, errMsg, operation string) (bool, error) { switch status { case pbmStatusDone: return true, nil @@ -508,8 +569,8 @@ func terminalStatusError(status, errMsg, operation string) (bool, error) { } } -func describeFailureError(info describeInfo, operation string) error { - err := groupDescribeErrors(info) +func describeErr(info describeInfo, operation string) error { + err := groupDescribeErrs(info) if err != nil && !errors.Is(err, errPBMOperationFailed) { return err } @@ -549,29 +610,42 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor checks := 0 for { + if err := ctx.Err(); err != nil { + return "", err + } + + checks++ + var list []pbmListRestore + err := execPBMCommand(ctx, dsn, &list, "list", "--restore") + if err != nil { + return "", errors.Wrapf(err, "pbm status error") + } + entry := findPITRRestore(list, restoreInfoPITRTime.Unix(), restoreInfo.StartedAt) + if entry != nil { + return entry.Name, nil + } + if checks > maxRestoreChecks { + return "", errors.Errorf("failed to start restore") + } + select { case <-ticker.C: - checks++ - var list []pbmListRestore - err := execPBMCommand(ctx, dsn, &list, "list", "--restore") - if err != nil { - return "", errors.Wrapf(err, "pbm status error") - } - entry := findPITRRestore(list, restoreInfoPITRTime.Unix(), restoreInfo.StartedAt) - if entry == nil { - if checks > maxRestoreChecks { - return "", errors.Errorf("failed to start restore") - } - continue - } else { - return entry.Name, nil - } case <-ctx.Done(): return "", ctx.Err() } } } +func fetchRestoreDescribe(ctx context.Context, dsn, name, backupType, confFile string) (describeInfo, error) { + var info describeInfo + args := []string{"describe-restore", name} + if backupType == "physical" { + args = append(args, "--config="+confFile) + } + err := execPBMCommand(ctx, dsn, &info, args...) + return info, err +} + func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dsn string, restoreInfo *pbmRestore, backupType, confFile string) error { l.Infof("Detecting restore name") var name string @@ -589,14 +663,8 @@ func waitForPBMRestore(ctx context.Context, l logrus.FieldLogger, dsn string, re l.Infof("waiting for pbm restore: %s", name) - return waitForPBMDescribe(ctx, newDescribePollConfig(l, dsn, pbmCmdRestore, name, func(ctx context.Context) (describeInfo, error) { - var info describeInfo - if backupType == "physical" { - err := execPBMCommand(ctx, dsn, &info, "describe-restore", name, "--config="+confFile) - return info, err - } - err := execPBMCommand(ctx, dsn, &info, "describe-restore", name) - return info, err + return waitDescribe(ctx, newDescribePoller(l, dsn, pbmCmdRestore, name, func(ctx context.Context) (describeInfo, error) { + return fetchRestoreDescribe(ctx, dsn, name, backupType, confFile) })) } @@ -730,7 +798,7 @@ func createPBMConfig(locationConfig *BackupLocationConfig, prefix string, pitr b return conf, nil } -func groupDescribeErrors(info describeInfo) error { +func groupDescribeErrs(info describeInfo) error { var errMsgs []string if info.Error != "" { diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index 3221c3ace9d..f489670ee75 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -18,24 +18,26 @@ import ( "context" "errors" "os" + "strings" "testing" "time" "github.com/sirupsen/logrus" + logrustest "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -func newTestDescribePollConfig(t *testing.T, opts ...func(*pbmDescribePollConfig)) *pbmDescribePollConfig { +func newTestPoller(t *testing.T, opts ...func(*describePoller)) *describePoller { t.Helper() - cfg := &pbmDescribePollConfig{ + cfg := &describePoller{ l: logrus.New(), dsn: "mongodb://localhost", operation: pbmCmdBackup, - targetName: "2024-01-01T00:00:00Z", + name: "2024-01-01T00:00:00Z", startedAt: time.Now(), - describeRetries: maxDescribeCommandRetries, + retries: maxDescribeRetries, fetchDescribe: func(context.Context) (describeInfo, error) { return describeInfo{Status: pbmStatusDone}, nil }, @@ -150,10 +152,10 @@ func TestCreatePBMConfig(t *testing.T) { } } -func TestIsTransientPBMDescribeError(t *testing.T) { +func TestIsTransientDescribeErr(t *testing.T) { t.Parallel() - assert.False(t, isTransientPBMDescribeError(nil)) + assert.False(t, isTransientDescribeErr(nil)) for _, tc := range []struct { name string @@ -180,6 +182,11 @@ func TestIsTransientPBMDescribeError(t *testing.T) { err: errors.New("get snapshot size: missed file"), want: true, }, + { + name: "backup meta permission denied", + err: errors.New("get backup meta: permission denied"), + want: false, + }, { name: "generic not found", err: errors.New("authentication failed: user not found"), @@ -193,142 +200,159 @@ func TestIsTransientPBMDescribeError(t *testing.T) { } { t.Run(tc.name, func(t *testing.T) { t.Parallel() - assert.Equal(t, tc.want, isTransientPBMDescribeError(tc.err)) + assert.Equal(t, tc.want, isTransientDescribeErr(tc.err)) }) } } -func TestOperationIsRunning(t *testing.T) { +func TestOpRunning(t *testing.T) { t.Parallel() - backupCfg := &pbmDescribePollConfig{ + backupCfg := &describePoller{ operation: pbmCmdBackup, - targetName: "backup-1", + name: "backup-1", } status := &pbmStatus{} status.Running.Type = pbmCmdBackup status.Running.Name = "backup-1" - assert.True(t, backupCfg.operationIsRunning(status)) + assert.True(t, backupCfg.opRunning(status)) - restoreCfg := &pbmDescribePollConfig{ + restoreCfg := &describePoller{ operation: pbmCmdRestore, - targetName: "restore-1", + name: "restore-1", } status.Running.Type = pbmCmdRestore status.Running.Name = "restore-1" - assert.True(t, restoreCfg.operationIsRunning(status)) + assert.True(t, restoreCfg.opRunning(status)) status.Running.Name = "restore-2" - assert.False(t, restoreCfg.operationIsRunning(status)) + assert.False(t, restoreCfg.opRunning(status)) - customCfg := &pbmDescribePollConfig{ + customCfg := &describePoller{ isRunning: func(*pbmStatus) bool { return true }, } - assert.True(t, customCfg.operationIsRunning(status)) + assert.True(t, customCfg.opRunning(status)) - unknownCfg := &pbmDescribePollConfig{operation: "unknown"} - assert.False(t, unknownCfg.operationIsRunning(status)) + unknownCfg := &describePoller{operation: "unknown"} + assert.False(t, unknownCfg.opRunning(status)) } -func TestSnapshotForTarget(t *testing.T) { +func TestTargetSnapshot(t *testing.T) { t.Parallel() - cfg := &pbmDescribePollConfig{ + cfg := &describePoller{ operation: pbmCmdBackup, - targetName: "snap-1", + name: "snap-1", } status := &pbmStatus{} status.Backups.Snapshot = []pbmSnapshot{{Name: "snap-1"}} - assert.NotNil(t, cfg.snapshotForTarget(status)) + assert.NotNil(t, cfg.targetSnapshot(status)) cfg.operation = pbmCmdRestore - assert.Nil(t, cfg.snapshotForTarget(status)) + assert.Nil(t, cfg.targetSnapshot(status)) } -func TestShouldRetryDescribeFailure(t *testing.T) { +func TestRetryTransient(t *testing.T) { t.Parallel() transientErr := errors.New("no such file") - assert.True(t, shouldRetryDescribeFailure(transientErr, time.Now())) - assert.False(t, shouldRetryDescribeFailure(transientErr, time.Now().Add(-pbmDescribeStartupGrace))) - assert.False(t, shouldRetryDescribeFailure(errors.New("permission denied"), time.Now())) + + t.Run("startup grace while running", func(t *testing.T) { + t.Parallel() + cfg := &describePoller{startedAt: time.Now()} + assert.True(t, retryTransient(transientErr, cfg, true)) + cfg.startedAt = time.Now().Add(-describeStartupGrace) + assert.False(t, retryTransient(transientErr, cfg, true)) + assert.False(t, retryTransient(errors.New("permission denied"), &describePoller{startedAt: time.Now()}, true)) + }) + + t.Run("completion grace after operation finished", func(t *testing.T) { + t.Parallel() + cfg := &describePoller{ + startedAt: time.Now().Add(-2 * time.Hour), + finishedAt: time.Now().Add(-1 * time.Minute), + } + assert.True(t, retryTransient(transientErr, cfg, false)) + cfg.finishedAt = time.Now().Add(-describeCompletionGrace) + assert.False(t, retryTransient(transientErr, cfg, false)) + }) } -func TestRetryDescribeCommand(t *testing.T) { +func TestRetryDescribeCmd(t *testing.T) { t.Parallel() - cfg := &pbmDescribePollConfig{ + cfg := &describePoller{ l: logrus.New(), operation: pbmCmdBackup, - describeRetries: 1, + retries: 1, } - assert.True(t, cfg.retryDescribeCommand(errors.New("temporary"))) - assert.Equal(t, 0, cfg.describeRetries) - assert.False(t, cfg.retryDescribeCommand(errors.New("temporary"))) + assert.True(t, cfg.retryDescribeCmd(errors.New("temporary"))) + assert.Equal(t, 0, cfg.retries) + assert.False(t, cfg.retryDescribeCmd(errors.New("temporary"))) } -func TestDescribeFailureError(t *testing.T) { +func TestDescribeErr(t *testing.T) { t.Parallel() - err := describeFailureError(describeInfo{Status: pbmStatusError}, pbmCmdBackup) + err := describeErr(describeInfo{Status: pbmStatusError}, pbmCmdBackup) require.EqualError(t, err, "backup failed") - err = describeFailureError(describeInfo{Status: pbmStatusError, Error: "oplog gap"}, pbmCmdBackup) + err = describeErr(describeInfo{Status: pbmStatusError, Error: "oplog gap"}, pbmCmdBackup) require.EqualError(t, err, "oplog gap") } -func TestGroupDescribeErrors_AllBranches(t *testing.T) { +func TestGroupDescribeErrs_AllBranches(t *testing.T) { t.Parallel() - err := groupDescribeErrors(describeInfo{}) + err := groupDescribeErrs(describeInfo{}) require.ErrorIs(t, err, errPBMOperationFailed) - err = groupDescribeErrors(describeInfo{Error: "top level"}) + err = groupDescribeErrs(describeInfo{Error: "top level"}) require.EqualError(t, err, "top level") - err = groupDescribeErrors(describeInfo{ + err = groupDescribeErrs(describeInfo{ ReplSets: []replSet{{Name: "rs0", Error: "rs failed"}}, }) require.EqualError(t, err, "replset: rs0, error: rs failed") } -func TestPollPBMDescribeOnce(t *testing.T) { +func TestPollDescribeOnce(t *testing.T) { t.Parallel() t.Run("describe done", func(t *testing.T) { t.Parallel() - done, err := pollPBMDescribeOnce(context.Background(), newTestDescribePollConfig(t)) + done, err := pollDescribeOnce(context.Background(), newTestPoller(t)) require.NoError(t, err) assert.True(t, done) }) t.Run("describe in progress", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{Status: "running"}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) }) t.Run("describe canceled", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{Status: pbmStatusCanceled}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.EqualError(t, err, "backup was canceled") assert.True(t, done) }) t.Run("describe partly done", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{ Status: pbmStatusPartlyDone, @@ -344,14 +368,14 @@ func TestPollPBMDescribeOnce(t *testing.T) { }, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.EqualError(t, err, "replset: rs0, node: node1, error: failed node") assert.True(t, done) }) t.Run("status fetch error", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("describe failed") } @@ -359,152 +383,152 @@ func TestPollPBMDescribeOnce(t *testing.T) { return nil, errors.New("status unavailable") } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.ErrorContains(t, err, "failed to get pbm status") assert.False(t, done) }) t.Run("running backup with transient describe error", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("no such file") } c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { status := &pbmStatus{} status.Running.Type = pbmCmdBackup - status.Running.Name = c.targetName + status.Running.Name = c.name return status, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) }) t.Run("running backup retries describe command", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = maxDescribeCommandRetries + cfg := newTestPoller(t, func(c *describePoller) { + c.startedAt = time.Now().Add(-describeStartupGrace) + c.retries = maxDescribeRetries c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { status := &pbmStatus{} status.Running.Type = pbmCmdBackup - status.Running.Name = c.targetName + status.Running.Name = c.name return status, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) - assert.Equal(t, maxDescribeCommandRetries-1, cfg.describeRetries) + assert.Equal(t, maxDescribeRetries-1, cfg.retries) }) t.Run("snapshot done when describe fails", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("no such file") } - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.startedAt = time.Now().Add(-describeStartupGrace) c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { status := &pbmStatus{} status.Backups.Snapshot = []pbmSnapshot{{ - Name: c.targetName, + Name: c.name, Status: pbmStatusDone, }} return status, nil } c.findSnapshot = func(status *pbmStatus) *pbmSnapshot { - return findPBMSnapshot(status, c.targetName) + return snapshotByName(status, c.name) } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.True(t, done) }) t.Run("snapshot terminal error", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("no such file") } - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.startedAt = time.Now().Add(-describeStartupGrace) c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { status := &pbmStatus{} status.Backups.Snapshot = []pbmSnapshot{{ - Name: c.targetName, + Name: c.name, Status: pbmStatusError, Error: "storage error", }} return status, nil } c.findSnapshot = func(status *pbmStatus) *pbmSnapshot { - return findPBMSnapshot(status, c.targetName) + return snapshotByName(status, c.name) } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.EqualError(t, err, "storage error") assert.True(t, done) }) t.Run("restore done when describe fails", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.operation = pbmCmdRestore - c.targetName = "2024-01-01T12:00:00Z" + c.name = "2024-01-01T12:00:00Z" c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("no such file") } - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) + c.startedAt = time.Now().Add(-describeStartupGrace) c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { return &pbmStatus{}, nil } c.fetchRestoreList = func(context.Context) ([]pbmListRestore, error) { return []pbmListRestore{{ - Name: c.targetName, + Name: c.name, Status: pbmStatusDone, }}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.True(t, done) }) t.Run("restore terminal error when describe fails", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.operation = pbmCmdRestore - c.targetName = "2024-01-01T12:00:00Z" + c.name = "2024-01-01T12:00:00Z" c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = 0 + c.startedAt = time.Now().Add(-describeStartupGrace) + c.retries = 0 c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { return &pbmStatus{}, nil } c.fetchRestoreList = func(context.Context) ([]pbmListRestore, error) { return []pbmListRestore{{ - Name: c.targetName, + Name: c.name, Status: pbmStatusError, Error: "node copy failed", }}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.EqualError(t, err, "node copy failed") assert.True(t, done) }) t.Run("startup grace for transient error", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { + cfg := newTestPoller(t, func(c *describePoller) { c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("file is empty") } @@ -512,54 +536,56 @@ func TestPollPBMDescribeOnce(t *testing.T) { return &pbmStatus{}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) }) t.Run("retries after startup grace when retries remain", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = 2 + cfg := newTestPoller(t, func(c *describePoller) { + c.startedAt = time.Now().Add(-describeStartupGrace) + c.finishedAt = time.Now().Add(-describeCompletionGrace) + c.retries = 2 c.fetchDescribe = func(context.Context) (describeInfo, error) { - return describeInfo{}, errors.New("file is empty") + return describeInfo{}, errors.New("permission denied") } c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { return &pbmStatus{}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) - assert.Equal(t, 1, cfg.describeRetries) + assert.Equal(t, 1, cfg.retries) }) t.Run("running backup with exhausted retries keeps waiting", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = 0 + cfg := newTestPoller(t, func(c *describePoller) { + c.startedAt = time.Now().Add(-describeStartupGrace) + c.retries = 0 c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { status := &pbmStatus{} status.Running.Type = pbmCmdBackup - status.Running.Name = c.targetName + status.Running.Name = c.name return status, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) }) t.Run("describe failure without running backup", func(t *testing.T) { t.Parallel() - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.startedAt = time.Now().Add(-pbmDescribeStartupGrace) - c.describeRetries = 0 + cfg := newTestPoller(t, func(c *describePoller) { + c.startedAt = time.Now().Add(-describeCompletionGrace) + c.finishedAt = time.Now().Add(-describeCompletionGrace) + c.retries = 0 c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{}, errors.New("permission denied") } @@ -567,37 +593,107 @@ func TestPollPBMDescribeOnce(t *testing.T) { return &pbmStatus{}, nil } }) - done, err := pollPBMDescribeOnce(context.Background(), cfg) + done, err := pollDescribeOnce(context.Background(), cfg) require.ErrorContains(t, err, "failed to get backup status") assert.False(t, done) }) + + t.Run("restore list fetch error keeps polling", func(t *testing.T) { + t.Parallel() + cfg := newTestPoller(t, func(c *describePoller) { + c.operation = pbmCmdRestore + c.name = "2024-01-01T12:00:00Z" + c.startedAt = time.Now().Add(-describeStartupGrace) + c.finishedAt = time.Now().Add(-30 * time.Second) + c.retries = 0 + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("no such file") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + c.fetchRestoreList = func(context.Context) ([]pbmListRestore, error) { + return nil, errors.New("list unavailable") + } + }) + done, err := pollDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + }) + + t.Run("transient error after operation finished uses completion grace", func(t *testing.T) { + t.Parallel() + cfg := newTestPoller(t, func(c *describePoller) { + c.startedAt = time.Now().Add(-2 * time.Hour) + c.finishedAt = time.Now().Add(-30 * time.Second) + c.retries = 0 + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("file is empty") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + return &pbmStatus{}, nil + } + }) + done, err := pollDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + }) + + t.Run("warns when describe keeps failing while running", func(t *testing.T) { + t.Parallel() + logger, hook := logrustest.NewNullLogger() + cfg := newTestPoller(t, func(c *describePoller) { + c.l = logger + c.startedAt = time.Now().Add(-describeRunningWarnInterval - time.Second) + c.fetchDescribe = func(context.Context) (describeInfo, error) { + return describeInfo{}, errors.New("permission denied") + } + c.fetchStatus = func(context.Context, string) (*pbmStatus, error) { + status := &pbmStatus{} + status.Running.Type = pbmCmdBackup + status.Running.Name = c.name + return status, nil + } + }) + done, err := pollDescribeOnce(context.Background(), cfg) + require.NoError(t, err) + assert.False(t, done) + var found bool + for _, entry := range hook.Entries { + if entry.Level == logrus.WarnLevel && strings.Contains(entry.Message, "still running") { + found = true + break + } + } + assert.True(t, found) + }) } -func TestWaitForPBMDescribe(t *testing.T) { +func TestWaitDescribe(t *testing.T) { t.Run("completes when describe reports done", func(t *testing.T) { - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.pollInterval = time.Millisecond + cfg := newTestPoller(t, func(c *describePoller) { + c.pollEvery = time.Millisecond }) - err := waitForPBMDescribe(context.Background(), cfg) + err := waitDescribe(context.Background(), cfg) require.NoError(t, err) }) t.Run("returns describe error", func(t *testing.T) { - cfg := newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.pollInterval = time.Millisecond + cfg := newTestPoller(t, func(c *describePoller) { + c.pollEvery = time.Millisecond c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{Status: pbmStatusCanceled}, nil } }) - err := waitForPBMDescribe(context.Background(), cfg) + err := waitDescribe(context.Background(), cfg) require.EqualError(t, err, "backup was canceled") }) t.Run("context canceled", func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() - err := waitForPBMDescribe(ctx, newTestDescribePollConfig(t, func(c *pbmDescribePollConfig) { - c.pollInterval = time.Millisecond + err := waitDescribe(ctx, newTestPoller(t, func(c *describePoller) { + c.pollEvery = time.Millisecond c.fetchDescribe = func(context.Context) (describeInfo, error) { return describeInfo{Status: "running"}, nil } @@ -625,59 +721,59 @@ func TestWritePBMConfigFile(t *testing.T) { require.FileExists(t, path) } -func TestDescribeTerminalError(t *testing.T) { +func TestCheckDescribe(t *testing.T) { t.Parallel() - done, err := describeTerminalError(describeInfo{Status: pbmStatusDone}, pbmCmdBackup) + done, err := checkDescribe(describeInfo{Status: pbmStatusDone}, pbmCmdBackup) require.NoError(t, err) assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: pbmStatusCanceled}, pbmCmdBackup) + done, err = checkDescribe(describeInfo{Status: pbmStatusCanceled}, pbmCmdBackup) require.EqualError(t, err, "backup was canceled") assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: pbmStatusError, Error: "oplog has insufficient range"}, pbmCmdBackup) + done, err = checkDescribe(describeInfo{Status: pbmStatusError, Error: "oplog has insufficient range"}, pbmCmdBackup) require.EqualError(t, err, "oplog has insufficient range") assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: pbmStatusPartlyDone, Error: "partial"}, pbmCmdBackup) + done, err = checkDescribe(describeInfo{Status: pbmStatusPartlyDone, Error: "partial"}, pbmCmdBackup) require.EqualError(t, err, "partial") assert.True(t, done) - done, err = describeTerminalError(describeInfo{Status: "running"}, pbmCmdBackup) + done, err = checkDescribe(describeInfo{Status: "running"}, pbmCmdBackup) require.NoError(t, err) assert.False(t, done) } -func TestTerminalStatusError(t *testing.T) { +func TestCheckStatus(t *testing.T) { t.Parallel() - done, err := terminalStatusError(pbmStatusDone, "", pbmCmdBackup) + done, err := checkStatus(pbmStatusDone, "", pbmCmdBackup) require.NoError(t, err) assert.True(t, done) - done, err = terminalStatusError(pbmStatusCanceled, "", pbmCmdBackup) + done, err = checkStatus(pbmStatusCanceled, "", pbmCmdBackup) require.EqualError(t, err, "backup was canceled") assert.True(t, done) - done, err = terminalStatusError(pbmStatusError, "storage unavailable", pbmCmdBackup) + done, err = checkStatus(pbmStatusError, "storage unavailable", pbmCmdBackup) require.EqualError(t, err, "storage unavailable") assert.True(t, done) - done, err = terminalStatusError(pbmStatusError, "", pbmCmdBackup) + done, err = checkStatus(pbmStatusError, "", pbmCmdBackup) require.EqualError(t, err, "backup failed") assert.True(t, done) - done, err = terminalStatusError(pbmStatusPartlyDone, "", pbmCmdRestore) + done, err = checkStatus(pbmStatusPartlyDone, "", pbmCmdRestore) require.EqualError(t, err, "restore partly completed") assert.True(t, done) - done, err = terminalStatusError("running", "", pbmCmdBackup) + done, err = checkStatus("running", "", pbmCmdBackup) require.NoError(t, err) assert.False(t, done) } -func TestFindPBMListRestore(t *testing.T) { +func TestRestoreByName(t *testing.T) { t.Parallel() list := []pbmListRestore{ @@ -685,11 +781,11 @@ func TestFindPBMListRestore(t *testing.T) { {Name: "2024-01-02T00:00:00Z", Status: pbmStatusError, Error: "failed"}, } - assert.Nil(t, findPBMListRestore(list, "missing")) - require.NotNil(t, findPBMListRestore(list, "2024-01-02T00:00:00Z")) + assert.Nil(t, restoreByName(list, "missing")) + require.NotNil(t, restoreByName(list, "2024-01-02T00:00:00Z")) } -func TestFindPBMSnapshot(t *testing.T) { +func TestSnapshotByName(t *testing.T) { t.Parallel() status := &pbmStatus{} @@ -698,14 +794,14 @@ func TestFindPBMSnapshot(t *testing.T) { {Name: "2024-01-02T00:00:00Z", Status: pbmStatusError, Error: "failed"}, } - assert.Nil(t, findPBMSnapshot(status, "missing")) - require.NotNil(t, findPBMSnapshot(status, "2024-01-02T00:00:00Z")) + assert.Nil(t, snapshotByName(status, "missing")) + require.NotNil(t, snapshotByName(status, "2024-01-02T00:00:00Z")) } -func TestGroupDescribeErrors(t *testing.T) { +func TestGroupDescribeErrs(t *testing.T) { t.Parallel() - err := groupDescribeErrors(describeInfo{ + err := groupDescribeErrs(describeInfo{ Status: pbmStatusPartlyDone, ReplSets: []replSet{{ Name: "rs0", From 9011f9d88aa58eb968a1399fa0ff281b3b3f40e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Tue, 16 Jun 2026 10:26:55 +0200 Subject: [PATCH 08/16] PMM-14576 Format. --- agent/runner/jobs/pbm_helpers.go | 28 ++++++++++++------------- agent/runner/jobs/pbm_helpers_test.go | 30 +++++++++++++-------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index f1c988af4a4..f5a44ec2399 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -40,7 +40,7 @@ const ( // PBM waits up to ~33s for backup metadata. Allow extra margin for describe-backup. describeStartupGrace = 60 * time.Second // After the operation stops, status/list metadata can lag behind describe failures. - describeCompletionGrace = 5 * time.Minute + describeCompletionGrace = 5 * time.Minute describeRunningWarnInterval = 5 * time.Minute pbmCmdBackup = "backup" @@ -274,20 +274,20 @@ func getPBMStatus(ctx context.Context, dsn string) (*pbmStatus, error) { } type describePoller struct { - l logrus.FieldLogger - dsn string - operation string - name string - startedAt time.Time - finishedAt time.Time - lastRunningWarn time.Time - pollEvery time.Duration - retries int - fetchDescribe func(context.Context) (describeInfo, error) - fetchStatus func(context.Context, string) (*pbmStatus, error) + l logrus.FieldLogger + dsn string + operation string + name string + startedAt time.Time + finishedAt time.Time + lastRunningWarn time.Time + pollEvery time.Duration + retries int + fetchDescribe func(context.Context) (describeInfo, error) + fetchStatus func(context.Context, string) (*pbmStatus, error) fetchRestoreList func(context.Context) ([]pbmListRestore, error) - isRunning func(*pbmStatus) bool - findSnapshot func(*pbmStatus) *pbmSnapshot + isRunning func(*pbmStatus) bool + findSnapshot func(*pbmStatus) *pbmSnapshot } func (cfg *describePoller) interval() time.Duration { diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index f489670ee75..9a644bf96a2 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -32,12 +32,12 @@ func newTestPoller(t *testing.T, opts ...func(*describePoller)) *describePoller t.Helper() cfg := &describePoller{ - l: logrus.New(), - dsn: "mongodb://localhost", - operation: pbmCmdBackup, + l: logrus.New(), + dsn: "mongodb://localhost", + operation: pbmCmdBackup, name: "2024-01-01T00:00:00Z", - startedAt: time.Now(), - retries: maxDescribeRetries, + startedAt: time.Now(), + retries: maxDescribeRetries, fetchDescribe: func(context.Context) (describeInfo, error) { return describeInfo{Status: pbmStatusDone}, nil }, @@ -209,8 +209,8 @@ func TestOpRunning(t *testing.T) { t.Parallel() backupCfg := &describePoller{ - operation: pbmCmdBackup, - name: "backup-1", + operation: pbmCmdBackup, + name: "backup-1", } status := &pbmStatus{} status.Running.Type = pbmCmdBackup @@ -218,8 +218,8 @@ func TestOpRunning(t *testing.T) { assert.True(t, backupCfg.opRunning(status)) restoreCfg := &describePoller{ - operation: pbmCmdRestore, - name: "restore-1", + operation: pbmCmdRestore, + name: "restore-1", } status.Running.Type = pbmCmdRestore status.Running.Name = "restore-1" @@ -240,8 +240,8 @@ func TestTargetSnapshot(t *testing.T) { t.Parallel() cfg := &describePoller{ - operation: pbmCmdBackup, - name: "snap-1", + operation: pbmCmdBackup, + name: "snap-1", } status := &pbmStatus{} status.Backups.Snapshot = []pbmSnapshot{{Name: "snap-1"}} @@ -268,7 +268,7 @@ func TestRetryTransient(t *testing.T) { t.Run("completion grace after operation finished", func(t *testing.T) { t.Parallel() cfg := &describePoller{ - startedAt: time.Now().Add(-2 * time.Hour), + startedAt: time.Now().Add(-2 * time.Hour), finishedAt: time.Now().Add(-1 * time.Minute), } assert.True(t, retryTransient(transientErr, cfg, false)) @@ -281,9 +281,9 @@ func TestRetryDescribeCmd(t *testing.T) { t.Parallel() cfg := &describePoller{ - l: logrus.New(), - operation: pbmCmdBackup, - retries: 1, + l: logrus.New(), + operation: pbmCmdBackup, + retries: 1, } assert.True(t, cfg.retryDescribeCmd(errors.New("temporary"))) From ef0ca4d77259c5eb1ce49020929e5c8616fdeac4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Tue, 16 Jun 2026 10:46:39 +0200 Subject: [PATCH 09/16] PMM-14576 Running backup not consume retries. --- agent/runner/jobs/pbm_helpers.go | 27 ++++++++++++++++----------- agent/runner/jobs/pbm_helpers_test.go | 4 ++-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index f5a44ec2399..616840e5e02 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -361,7 +361,7 @@ func pollDescribeOnce(ctx context.Context, cfg *describePoller) (bool, error) { if running { cfg.warnRunningDescribe(describeErr) - cfg.retryDescribeErr(describeErr, true) + cfg.logRunningDescribeErr(describeErr) return false, nil } @@ -369,7 +369,7 @@ func pollDescribeOnce(ctx context.Context, cfg *describePoller) (bool, error) { return true, err } - if cfg.retryDescribeErr(describeErr, false) { + if cfg.retryDescribeErr(describeErr) { return false, nil } @@ -416,15 +416,20 @@ func (cfg *describePoller) lastWarnAt() time.Time { return cfg.startedAt } -func (cfg *describePoller) retryDescribeErr(describeErr error, running bool) bool { - if retryTransient(describeErr, cfg, running) { - if running { - cfg.l.Debugf("%s transient error while %s %q is still running: %v", - cfg.describeCmd(), cfg.operation, cfg.name, describeErr) - } else { - cfg.l.Debugf("%s transient error while waiting for %s %q completion metadata: %v", - cfg.describeCmd(), cfg.operation, cfg.name, describeErr) - } +func (cfg *describePoller) logRunningDescribeErr(describeErr error) { + if retryTransient(describeErr, cfg, true) { + cfg.l.Debugf("%s transient error while %s %q is still running: %v", + cfg.describeCmd(), cfg.operation, cfg.name, describeErr) + return + } + cfg.l.Debugf("%s error while %s %q is still running: %v", + cfg.describeCmd(), cfg.operation, cfg.name, describeErr) +} + +func (cfg *describePoller) retryDescribeErr(describeErr error) bool { + if retryTransient(describeErr, cfg, false) { + cfg.l.Debugf("%s transient error while waiting for %s %q completion metadata: %v", + cfg.describeCmd(), cfg.operation, cfg.name, describeErr) return true } return cfg.retryDescribeCmd(describeErr) diff --git a/agent/runner/jobs/pbm_helpers_test.go b/agent/runner/jobs/pbm_helpers_test.go index 9a644bf96a2..2e01eda88ad 100644 --- a/agent/runner/jobs/pbm_helpers_test.go +++ b/agent/runner/jobs/pbm_helpers_test.go @@ -406,7 +406,7 @@ func TestPollDescribeOnce(t *testing.T) { assert.False(t, done) }) - t.Run("running backup retries describe command", func(t *testing.T) { + t.Run("running backup does not consume retries", func(t *testing.T) { t.Parallel() cfg := newTestPoller(t, func(c *describePoller) { c.startedAt = time.Now().Add(-describeStartupGrace) @@ -424,7 +424,7 @@ func TestPollDescribeOnce(t *testing.T) { done, err := pollDescribeOnce(context.Background(), cfg) require.NoError(t, err) assert.False(t, done) - assert.Equal(t, maxDescribeRetries-1, cfg.retries) + assert.Equal(t, maxDescribeRetries, cfg.retries) }) t.Run("snapshot done when describe fails", func(t *testing.T) { From c3050f48660f8ef51c033674cdb5cedad99ed301 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Tue, 16 Jun 2026 10:47:41 +0200 Subject: [PATCH 10/16] PMM-14576 Lint. --- agent/runner/jobs/pbm_helpers.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index 616840e5e02..96fd3d597ee 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -324,7 +324,8 @@ func waitDescribe(ctx context.Context, cfg *describePoller) error { defer ticker.Stop() for { - if err := ctx.Err(); err != nil { + err := ctx.Err() + if err != nil { return err } @@ -615,13 +616,14 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor checks := 0 for { - if err := ctx.Err(); err != nil { + err = ctx.Err() + if err != nil { return "", err } checks++ var list []pbmListRestore - err := execPBMCommand(ctx, dsn, &list, "list", "--restore") + err = execPBMCommand(ctx, dsn, &list, "list", "--restore") if err != nil { return "", errors.Wrapf(err, "pbm status error") } From c36457ca50e4e690db2ca7d14ef64b775ae86afe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 22 Jun 2026 10:44:27 +0200 Subject: [PATCH 11/16] PMM-14576 Max's suggestion. --- agent/runner/jobs/pbm_helpers.go | 114 +++++++++++++------------------ agent/utils/poll/poll.go | 53 ++++++++++++++ agent/utils/poll/poll_test.go | 87 +++++++++++++++++++++++ 3 files changed, 186 insertions(+), 68 deletions(-) create mode 100644 agent/utils/poll/poll.go create mode 100644 agent/utils/poll/poll_test.go diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index cd6a1921a67..227ffc40409 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -28,6 +28,8 @@ import ( "github.com/sirupsen/logrus" "gopkg.in/yaml.v3" + + "github.com/percona/pmm/agent/utils/poll" ) const ( @@ -232,23 +234,13 @@ func retrieveLogs(ctx context.Context, dsn string, event string) ([]pbmLogEntry, func waitForPBMNoRunningOperations(ctx context.Context, l logrus.FieldLogger, dsn string) error { l.Info("Waiting for no running pbm operations.") - ticker := time.NewTicker(statusCheckInterval) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - status, err := getPBMStatus(ctx, dsn) - if err != nil { - return err - } - if status.Running.Type == "" { - return nil - } - case <-ctx.Done(): - return ctx.Err() + return poll.PollUntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { + status, err := getPBMStatus(ctx, dsn) + if err != nil { + return false, err } - } + return status.Running.Type == "", nil + }) } func isShardedCluster(ctx context.Context, dsn string) (bool, error) { @@ -320,29 +312,18 @@ func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, nam } func waitDescribe(ctx context.Context, cfg *describePoller) error { - ticker := time.NewTicker(cfg.interval()) - defer ticker.Stop() - - for { + return poll.PollUntilContextTimeout(ctx, cfg.interval(), func(ctx context.Context) (bool, error) { err := ctx.Err() if err != nil { - return err + return false, err } done, err := pollDescribeOnce(ctx, cfg) if err != nil { - return err + return false, err } - if done { - return nil - } - - select { - case <-ticker.C: - case <-ctx.Done(): - return ctx.Err() - } - } + return done, nil + }) } func pollDescribeOnce(ctx context.Context, cfg *describePoller) (bool, error) { @@ -611,36 +592,35 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor return "", err } - ticker := time.NewTicker(statusCheckInterval) - defer ticker.Stop() - + var name string checks := 0 - for { + err = poll.PollUntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { err = ctx.Err() if err != nil { - return "", err + return false, err } checks++ var list []pbmListRestore err = execPBMCommand(ctx, dsn, &list, "list", "--restore") if err != nil { - return "", errors.Wrapf(err, "pbm status error") + return false, errors.Wrapf(err, "pbm status error") } entry := findPITRRestore(list, restoreInfoPITRTime.Unix(), restoreInfo.StartedAt) if entry != nil { - return entry.Name, nil + name = entry.Name + return true, nil } if checks > maxRestoreChecks { - return "", errors.Errorf("failed to start restore") - } - - select { - case <-ticker.C: - case <-ctx.Done(): - return "", ctx.Err() + return false, errors.Errorf("failed to start restore") } + return false, nil + }) + if err != nil { + return "", err } + + return name, nil } func fetchRestoreDescribe(ctx context.Context, dsn, name, backupType, confFile string) (describeInfo, error) { @@ -850,31 +830,29 @@ func pbmGetSnapshotTimestamp(ctx context.Context, l logrus.FieldLogger, dsn stri // getSnapshots returns all PBM snapshots found in configured location. func getSnapshots(ctx context.Context, l logrus.FieldLogger, dsn string) ([]pbmSnapshot, error) { // Sometimes PBM returns empty list of snapshots, that's why we're trying to get them several times. - ticker := time.NewTicker(listCheckInterval) - defer ticker.Stop() - + var snapshots []pbmSnapshot checks := 0 - for { - select { - case <-ticker.C: - checks++ - status, err := getPBMStatus(ctx, dsn) - if err != nil { - return nil, err - } + err := poll.PollUntilContextTimeout(ctx, listCheckInterval, func(ctx context.Context) (bool, error) { + checks++ + status, err := getPBMStatus(ctx, dsn) + if err != nil { + return false, err + } - if len(status.Backups.Snapshot) == 0 { - l.Debugf("Attempt %d to get a list of PBM artifacts has failed.", checks) - if checks > maxListChecks { - return nil, fmt.Errorf("got no one snapshot: %w", ErrNotFound) - } - continue + if len(status.Backups.Snapshot) == 0 { + l.Debugf("Attempt %d to get a list of PBM artifacts has failed.", checks) + if checks > maxListChecks { + return false, fmt.Errorf("got no one snapshot: %w", ErrNotFound) } - - return status.Backups.Snapshot, nil - - case <-ctx.Done(): - return nil, ctx.Err() + return false, nil } + + snapshots = status.Backups.Snapshot + return true, nil + }) + if err != nil { + return nil, err } + + return snapshots, nil } diff --git a/agent/utils/poll/poll.go b/agent/utils/poll/poll.go new file mode 100644 index 00000000000..5fc17cab3ee --- /dev/null +++ b/agent/utils/poll/poll.go @@ -0,0 +1,53 @@ +// Copyright (C) 2023 Percona LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package poll provides helpers for polling until a condition is met. +package poll + +import ( + "context" + "time" +) + +// ConditionFunc returns true when the condition is successfully met. +type ConditionFunc func(ctx context.Context) (done bool, err error) + +// PollUntilContextTimeout polls until condition returns done=true, err!=nil, or ctx is canceled. +func PollUntilContextTimeout(ctx context.Context, interval time.Duration, condition ConditionFunc) error { + done, err := condition(ctx) + if err != nil { + return err + } + if done { + return nil + } + + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + done, err := condition(ctx) + if err != nil { + return err + } + if done { + return nil + } + } + } +} diff --git a/agent/utils/poll/poll_test.go b/agent/utils/poll/poll_test.go new file mode 100644 index 00000000000..8c314bc1673 --- /dev/null +++ b/agent/utils/poll/poll_test.go @@ -0,0 +1,87 @@ +// Copyright (C) 2023 Percona LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package poll + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPollUntilContextTimeout(t *testing.T) { + t.Parallel() + + t.Run("immediate success", func(t *testing.T) { + t.Parallel() + + calls := 0 + err := PollUntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { + calls++ + return true, nil + }) + require.NoError(t, err) + assert.Equal(t, 1, calls) + }) + + t.Run("success after retries", func(t *testing.T) { + t.Parallel() + + calls := 0 + err := PollUntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { + calls++ + return calls == 3, nil + }) + require.NoError(t, err) + assert.Equal(t, 3, calls) + }) + + t.Run("condition error", func(t *testing.T) { + t.Parallel() + + expected := errors.New("boom") + err := PollUntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { + return false, expected + }) + assert.ErrorIs(t, err, expected) + }) + + t.Run("context canceled", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(t.Context()) + cancel() + + err := PollUntilContextTimeout(ctx, time.Millisecond, func(context.Context) (bool, error) { + return false, nil + }) + assert.ErrorIs(t, err, context.Canceled) + }) + + t.Run("context timeout", func(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithTimeout(t.Context(), 20*time.Millisecond) + defer cancel() + + err := PollUntilContextTimeout(ctx, 5*time.Millisecond, func(context.Context) (bool, error) { + return false, nil + }) + assert.ErrorIs(t, err, context.DeadlineExceeded) + }) +} From 416cca1a9fd22fe8015891a64eaf4f2e845f91e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 22 Jun 2026 11:14:52 +0200 Subject: [PATCH 12/16] PMM-14576 Fix after conflicts. --- agent/runner/jobs/pbm_helpers.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index 227ffc40409..92d628abf72 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -335,7 +335,7 @@ func pollDescribeOnce(ctx context.Context, cfg *describePoller) (bool, error) { status, statusErr := cfg.getPBMStatus(ctx) if statusErr != nil { - return false, errors.Wrap(statusErr, "failed to get pbm status") + return false, fmt.Errorf("failed to get pbm status: %w", statusErr) } running := cfg.opRunning(status) @@ -355,7 +355,7 @@ func pollDescribeOnce(ctx context.Context, cfg *describePoller) (bool, error) { return false, nil } - return false, errors.Wrapf(describeErr, "failed to get %s status", cfg.operation) + return false, fmt.Errorf("failed to get %s status: %w", cfg.operation, describeErr) } func (cfg *describePoller) getPBMStatus(ctx context.Context) (*pbmStatus, error) { @@ -540,17 +540,17 @@ func checkStatus(status, errMsg, operation string) (bool, error) { case pbmStatusDone: return true, nil case pbmStatusCanceled: - return true, errors.Errorf("%s was canceled", operation) + return true, fmt.Errorf("%s was canceled", operation) case pbmStatusError: if errMsg != "" { return true, errors.New(errMsg) } - return true, errors.Errorf("%s failed", operation) + return true, fmt.Errorf("%s failed", operation) case pbmStatusPartlyDone: if errMsg != "" { return true, errors.New(errMsg) } - return true, errors.Errorf("%s partly completed", operation) + return true, fmt.Errorf("%s partly completed", operation) default: return false, nil } @@ -561,7 +561,7 @@ func describeErr(info describeInfo, operation string) error { if err != nil && !errors.Is(err, errPBMOperationFailed) { return err } - return errors.Errorf("%s failed", operation) + return fmt.Errorf("%s failed", operation) } func findPITRRestore(list []pbmListRestore, restoreInfoPITRTime int64, startedAt time.Time) *pbmListRestore { @@ -604,7 +604,7 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor var list []pbmListRestore err = execPBMCommand(ctx, dsn, &list, "list", "--restore") if err != nil { - return false, errors.Wrapf(err, "pbm status error") + return false, fmt.Errorf("pbm status error: %w", err) } entry := findPITRRestore(list, restoreInfoPITRTime.Unix(), restoreInfo.StartedAt) if entry != nil { @@ -612,7 +612,7 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor return true, nil } if checks > maxRestoreChecks { - return false, errors.Errorf("failed to start restore") + return false, fmt.Errorf("failed to start restore") } return false, nil }) From 8aace639a9451d55a1fcc4228b9c192205de0144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 22 Jun 2026 11:23:57 +0200 Subject: [PATCH 13/16] PMM-14576 Lint. --- agent/runner/jobs/pbm_helpers.go | 10 +++++----- agent/utils/poll/poll.go | 4 ++-- agent/utils/poll/poll_test.go | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/agent/runner/jobs/pbm_helpers.go b/agent/runner/jobs/pbm_helpers.go index 92d628abf72..4cd9ac9e8bf 100644 --- a/agent/runner/jobs/pbm_helpers.go +++ b/agent/runner/jobs/pbm_helpers.go @@ -234,7 +234,7 @@ func retrieveLogs(ctx context.Context, dsn string, event string) ([]pbmLogEntry, func waitForPBMNoRunningOperations(ctx context.Context, l logrus.FieldLogger, dsn string) error { l.Info("Waiting for no running pbm operations.") - return poll.PollUntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { + return poll.UntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { status, err := getPBMStatus(ctx, dsn) if err != nil { return false, err @@ -312,7 +312,7 @@ func waitForPBMBackup(ctx context.Context, l logrus.FieldLogger, dsn string, nam } func waitDescribe(ctx context.Context, cfg *describePoller) error { - return poll.PollUntilContextTimeout(ctx, cfg.interval(), func(ctx context.Context) (bool, error) { + return poll.UntilContextTimeout(ctx, cfg.interval(), func(ctx context.Context) (bool, error) { err := ctx.Err() if err != nil { return false, err @@ -594,7 +594,7 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor var name string checks := 0 - err = poll.PollUntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { + err = poll.UntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { err = ctx.Err() if err != nil { return false, err @@ -612,7 +612,7 @@ func findPITRRestoreName(ctx context.Context, dsn string, restoreInfo *pbmRestor return true, nil } if checks > maxRestoreChecks { - return false, fmt.Errorf("failed to start restore") + return false, errors.New("failed to start restore") } return false, nil }) @@ -832,7 +832,7 @@ func getSnapshots(ctx context.Context, l logrus.FieldLogger, dsn string) ([]pbmS // Sometimes PBM returns empty list of snapshots, that's why we're trying to get them several times. var snapshots []pbmSnapshot checks := 0 - err := poll.PollUntilContextTimeout(ctx, listCheckInterval, func(ctx context.Context) (bool, error) { + err := poll.UntilContextTimeout(ctx, listCheckInterval, func(ctx context.Context) (bool, error) { checks++ status, err := getPBMStatus(ctx, dsn) if err != nil { diff --git a/agent/utils/poll/poll.go b/agent/utils/poll/poll.go index 5fc17cab3ee..8bcf81c98b4 100644 --- a/agent/utils/poll/poll.go +++ b/agent/utils/poll/poll.go @@ -23,8 +23,8 @@ import ( // ConditionFunc returns true when the condition is successfully met. type ConditionFunc func(ctx context.Context) (done bool, err error) -// PollUntilContextTimeout polls until condition returns done=true, err!=nil, or ctx is canceled. -func PollUntilContextTimeout(ctx context.Context, interval time.Duration, condition ConditionFunc) error { +// UntilContextTimeout polls until condition returns done=true, err!=nil, or ctx is canceled. +func UntilContextTimeout(ctx context.Context, interval time.Duration, condition ConditionFunc) error { done, err := condition(ctx) if err != nil { return err diff --git a/agent/utils/poll/poll_test.go b/agent/utils/poll/poll_test.go index 8c314bc1673..4fc414d30ed 100644 --- a/agent/utils/poll/poll_test.go +++ b/agent/utils/poll/poll_test.go @@ -24,14 +24,14 @@ import ( "github.com/stretchr/testify/require" ) -func TestPollUntilContextTimeout(t *testing.T) { +func TestUntilContextTimeout(t *testing.T) { t.Parallel() t.Run("immediate success", func(t *testing.T) { t.Parallel() calls := 0 - err := PollUntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { + err := UntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { calls++ return true, nil }) @@ -43,7 +43,7 @@ func TestPollUntilContextTimeout(t *testing.T) { t.Parallel() calls := 0 - err := PollUntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { + err := UntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { calls++ return calls == 3, nil }) @@ -55,7 +55,7 @@ func TestPollUntilContextTimeout(t *testing.T) { t.Parallel() expected := errors.New("boom") - err := PollUntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { + err := UntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { return false, expected }) assert.ErrorIs(t, err, expected) @@ -67,7 +67,7 @@ func TestPollUntilContextTimeout(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) cancel() - err := PollUntilContextTimeout(ctx, time.Millisecond, func(context.Context) (bool, error) { + err := UntilContextTimeout(ctx, time.Millisecond, func(context.Context) (bool, error) { return false, nil }) assert.ErrorIs(t, err, context.Canceled) @@ -79,7 +79,7 @@ func TestPollUntilContextTimeout(t *testing.T) { ctx, cancel := context.WithTimeout(t.Context(), 20*time.Millisecond) defer cancel() - err := PollUntilContextTimeout(ctx, 5*time.Millisecond, func(context.Context) (bool, error) { + err := UntilContextTimeout(ctx, 5*time.Millisecond, func(context.Context) (bool, error) { return false, nil }) assert.ErrorIs(t, err, context.DeadlineExceeded) From dccff1478325e8a4b11506a2956b579784e3ac64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Mon, 22 Jun 2026 12:08:37 +0200 Subject: [PATCH 14/16] PMM-14576 Another refactor. --- agent/runner/jobs/mongodb_restore_job.go | 51 +++++++++++++----------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/agent/runner/jobs/mongodb_restore_job.go b/agent/runner/jobs/mongodb_restore_job.go index 5597fa04882..750036ce343 100644 --- a/agent/runner/jobs/mongodb_restore_job.go +++ b/agent/runner/jobs/mongodb_restore_job.go @@ -27,6 +27,7 @@ import ( "github.com/sirupsen/logrus" "google.golang.org/protobuf/types/known/timestamppb" + "github.com/percona/pmm/agent/utils/poll" agentv1 "github.com/percona/pmm/api/agent/v1" ) @@ -211,36 +212,38 @@ func (j *MongoDBRestoreJob) startRestore(ctx context.Context, backupName string) j.l.Infof("starting backup restore for: %s.", backupName) var restoreOutput pbmRestore - var err error startTime := time.Now() - - ticker := time.NewTicker(statusCheckInterval) - defer ticker.Stop() retryCount := 500 + started := false - for { - select { - case <-ticker.C: + err := poll.UntilContextTimeout(ctx, statusCheckInterval, func(ctx context.Context) (bool, error) { + // Preserve previous behavior: first restore command runs after the first tick. + if !started { + started = true + return false, nil + } - if j.pitrTimestamp.Unix() == 0 { - err = execPBMCommand(ctx, j.dbURL, &restoreOutput, "restore", backupName) - } else { - err = execPBMCommand(ctx, j.dbURL, &restoreOutput, "restore", "--time="+j.pitrTimestamp.Format("2006-01-02T15:04:05")) - } + var cmdErr error + if j.pitrTimestamp.Unix() == 0 { + cmdErr = execPBMCommand(ctx, j.dbURL, &restoreOutput, "restore", backupName) + } else { + cmdErr = execPBMCommand(ctx, j.dbURL, &restoreOutput, "restore", "--time="+j.pitrTimestamp.Format("2006-01-02T15:04:05")) + } - if err != nil { - if strings.HasSuffix(err.Error(), "another operation in progress") && retryCount > 0 { - retryCount-- - continue - } - return nil, fmt.Errorf("pbm restore error: %w", err) + if cmdErr != nil { + if strings.HasSuffix(cmdErr.Error(), "another operation in progress") && retryCount > 0 { + retryCount-- + return false, nil } - - restoreOutput.StartedAt = startTime - return &restoreOutput, nil - - case <-ctx.Done(): - return nil, ctx.Err() + return false, fmt.Errorf("pbm restore error: %w", cmdErr) } + + restoreOutput.StartedAt = startTime + return true, nil + }) + if err != nil { + return nil, err } + + return &restoreOutput, nil } From 728c61a45550458a55d5a44f44c10861f20c6863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Tue, 23 Jun 2026 09:53:26 +0200 Subject: [PATCH 15/16] PMM-14576 Improvements. --- agent/utils/poll/poll.go | 13 +++++++++++++ agent/utils/poll/poll_test.go | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/agent/utils/poll/poll.go b/agent/utils/poll/poll.go index 8bcf81c98b4..eef7147b46f 100644 --- a/agent/utils/poll/poll.go +++ b/agent/utils/poll/poll.go @@ -17,6 +17,7 @@ package poll import ( "context" + "fmt" "time" ) @@ -25,6 +26,14 @@ type ConditionFunc func(ctx context.Context) (done bool, err error) // UntilContextTimeout polls until condition returns done=true, err!=nil, or ctx is canceled. func UntilContextTimeout(ctx context.Context, interval time.Duration, condition ConditionFunc) error { + if interval <= 0 { + return fmt.Errorf("interval must be positive: %s", interval) + } + + if err := ctx.Err(); err != nil { + return err + } + done, err := condition(ctx) if err != nil { return err @@ -41,6 +50,10 @@ func UntilContextTimeout(ctx context.Context, interval time.Duration, condition case <-ctx.Done(): return ctx.Err() case <-ticker.C: + if err := ctx.Err(); err != nil { + return err + } + done, err := condition(ctx) if err != nil { return err diff --git a/agent/utils/poll/poll_test.go b/agent/utils/poll/poll_test.go index 4fc414d30ed..51e8178456f 100644 --- a/agent/utils/poll/poll_test.go +++ b/agent/utils/poll/poll_test.go @@ -67,10 +67,13 @@ func TestUntilContextTimeout(t *testing.T) { ctx, cancel := context.WithCancel(t.Context()) cancel() + calls := 0 err := UntilContextTimeout(ctx, time.Millisecond, func(context.Context) (bool, error) { + calls++ return false, nil }) assert.ErrorIs(t, err, context.Canceled) + assert.Equal(t, 0, calls) }) t.Run("context timeout", func(t *testing.T) { @@ -84,4 +87,17 @@ func TestUntilContextTimeout(t *testing.T) { }) assert.ErrorIs(t, err, context.DeadlineExceeded) }) + + t.Run("invalid interval", func(t *testing.T) { + t.Parallel() + + calls := 0 + err := UntilContextTimeout(t.Context(), 0, func(context.Context) (bool, error) { + calls++ + return true, nil + }) + require.Error(t, err) + assert.ErrorContains(t, err, "interval must be positive") + assert.Equal(t, 0, calls) + }) } From 491925b225e3771550d2b7940e6d018c4b5654da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C4=8Ctvrtka?= Date: Tue, 23 Jun 2026 10:03:27 +0200 Subject: [PATCH 16/16] PMM-14576 Lint. --- agent/utils/poll/poll.go | 6 ++++-- agent/utils/poll/poll_test.go | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/agent/utils/poll/poll.go b/agent/utils/poll/poll.go index eef7147b46f..00d370be389 100644 --- a/agent/utils/poll/poll.go +++ b/agent/utils/poll/poll.go @@ -30,7 +30,8 @@ func UntilContextTimeout(ctx context.Context, interval time.Duration, condition return fmt.Errorf("interval must be positive: %s", interval) } - if err := ctx.Err(); err != nil { + err := ctx.Err() + if err != nil { return err } @@ -50,7 +51,8 @@ func UntilContextTimeout(ctx context.Context, interval time.Duration, condition case <-ctx.Done(): return ctx.Err() case <-ticker.C: - if err := ctx.Err(); err != nil { + err = ctx.Err() + if err != nil { return err } diff --git a/agent/utils/poll/poll_test.go b/agent/utils/poll/poll_test.go index 51e8178456f..cd4984fcdc8 100644 --- a/agent/utils/poll/poll_test.go +++ b/agent/utils/poll/poll_test.go @@ -58,7 +58,7 @@ func TestUntilContextTimeout(t *testing.T) { err := UntilContextTimeout(t.Context(), time.Millisecond, func(context.Context) (bool, error) { return false, expected }) - assert.ErrorIs(t, err, expected) + require.ErrorIs(t, err, expected) }) t.Run("context canceled", func(t *testing.T) { @@ -72,7 +72,7 @@ func TestUntilContextTimeout(t *testing.T) { calls++ return false, nil }) - assert.ErrorIs(t, err, context.Canceled) + require.ErrorIs(t, err, context.Canceled) assert.Equal(t, 0, calls) }) @@ -85,7 +85,7 @@ func TestUntilContextTimeout(t *testing.T) { err := UntilContextTimeout(ctx, 5*time.Millisecond, func(context.Context) (bool, error) { return false, nil }) - assert.ErrorIs(t, err, context.DeadlineExceeded) + require.ErrorIs(t, err, context.DeadlineExceeded) }) t.Run("invalid interval", func(t *testing.T) { @@ -97,7 +97,7 @@ func TestUntilContextTimeout(t *testing.T) { return true, nil }) require.Error(t, err) - assert.ErrorContains(t, err, "interval must be positive") + require.ErrorContains(t, err, "interval must be positive") assert.Equal(t, 0, calls) }) }