Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions cmd/bb_runner/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ func main() {
commandCreator,
configuration.SetTmpdirEnvironmentVariable,
)
if configuration.AssumeExclusiveCgroup {
var err error
r, err = runner.NewCgroupResourceUsageSamplingRunner(r)
if err != nil {
return util.StatusWrap(err, "Failed to enable cgroup resource usage sampling")
}
}

// Let bb_runner replace temporary directories with symbolic
// links pointing to the temporary directory set up by
Expand Down
28 changes: 28 additions & 0 deletions pkg/builder/local_build_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/buildbarn/bb-remote-execution/pkg/filesystem/access"
"github.com/buildbarn/bb-remote-execution/pkg/filesystem/pool"
"github.com/buildbarn/bb-remote-execution/pkg/proto/remoteworker"
"github.com/buildbarn/bb-remote-execution/pkg/proto/resourceusage"
runner_pb "github.com/buildbarn/bb-remote-execution/pkg/proto/runner"
"github.com/buildbarn/bb-storage/pkg/blobstore"
"github.com/buildbarn/bb-storage/pkg/clock"
Expand All @@ -36,6 +37,19 @@ var (
checkReadinessComponent = path.MustNewComponent("check_readiness")
)

func getCgroupResourceUsage(result *remoteexecution.ActionResult) *resourceusage.CgroupResourceUsage {
if result == nil || result.ExecutionMetadata == nil {
return nil
}
var cgroupUsage resourceusage.CgroupResourceUsage
for _, metadata := range result.ExecutionMetadata.AuxiliaryMetadata {
if metadata.UnmarshalTo(&cgroupUsage) == nil {
return &cgroupUsage
}
}
return nil
}

// capturingErrorLogger is an error logger that stores up to a single
// error. When the error is stored, a context cancelation function is
// invoked. This is used by localBuildExecutor to kill a build action in
Expand Down Expand Up @@ -309,6 +323,20 @@ func (be *localBuildExecutor) Execute(ctx context.Context, filePool pool.FilePoo
if runErr == nil {
response.Result.ExitCode = int32(runResponse.ExitCode)
response.Result.ExecutionMetadata.AuxiliaryMetadata = append(response.Result.ExecutionMetadata.AuxiliaryMetadata, runResponse.ResourceUsage...)
if cgroupUsage := getCgroupResourceUsage(response.Result); cgroupUsage != nil && cgroupUsage.MemoryEventsOomKill > 0 {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather not make changes along these lines, as bb_worker should remain oblivious of runtime details like groups. Can't bb_runner just return the correct values?

Also, why do we need this in the first place? If something gets OOM killed, shouldn't that already lead to non-zero exit codes? And even if it returns an exit code of zero, that might be intentional? (For example, you may want to write an integration test that ensures that your software does the right thing when OOM killed?)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, so this code does three things:

  1. For an action that experienced an OOM-kill, force it to fail with ExitCode=1
    • Agree with your reasoning that this might be intentional, we should just remove this
  2. For an action that experienced an OOM-kill and hit its cgroup limit, we surface this in response.Message
    • This is only really for UX convenience, I was thinking bb-browser could show the message next to the nonzero exit code to make it more obvious to the users why the action (likely) failed. But the CgroupResourceUsage is attached as response metadata, so the information is also available there. Would you rather not convey the OOM info in Message?
  3. For an action that experienced an OOM-kill but did not hit its cgroup memory limit, fail it with Unavailable.
    • This would make it safer to run bb_runner in a memory-overcommitted environment and singleProcessOOMKill; currently such a setup can result in intermittent action failures. So I think this would still be desired, don't you think?

@scele scele Jun 10, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If (and only if) you agree that the third point is desireable, that alone could easily be done in CgroupResourceUsageSamplingRunner. Then we wouldn't need any worker changes. WDYT?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latest commits remove ExitCode override, remove Message override, and move the "return Unavailable if the action was OOMkilled but the cgroup didn't OOM (since that's an infra issue that should't be attributed to the action)" logic to the runner. Let me know if you still want this to be changed.

if cgroupUsage.MemoryEventsOom > 0 {
response.Message = "Action failed due to out of memory: cgroup memory limit was reached and a process was OOM-killed"
if response.Result.ExitCode == 0 {
response.Result.ExitCode = 1
}
} else {
// The cgroup did not reach its memory limit, so the OOM
// kill likely came from system-level memory pressure, such
// as node memory overcommitment. Treat this as retryable
// infrastructure failure.
attachErrorToExecuteResponse(response, status.Error(codes.Unavailable, "An action process was OOM-killed without the action reaching its cgroup memory limit"))
}
}
} else {
attachErrorToExecuteResponse(response, util.StatusWrap(runErr, "Failed to run command"))
}
Expand Down
146 changes: 146 additions & 0 deletions pkg/builder/local_build_executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
re_clock "github.com/buildbarn/bb-remote-execution/pkg/clock"
"github.com/buildbarn/bb-remote-execution/pkg/filesystem/access"
"github.com/buildbarn/bb-remote-execution/pkg/proto/remoteworker"
"github.com/buildbarn/bb-remote-execution/pkg/proto/resourceusage"
runner_pb "github.com/buildbarn/bb-remote-execution/pkg/proto/runner"
"github.com/buildbarn/bb-storage/pkg/blobstore/buffer"
"github.com/buildbarn/bb-storage/pkg/digest"
Expand Down Expand Up @@ -794,6 +795,151 @@ func TestLocalBuildExecutorSuccess(t *testing.T) {
}, executeResponse)
}

func TestLocalBuildExecutorReportsCgroupOOMKill(t *testing.T) {
for _, testCase := range []struct {
name string
cgroupUsage *resourceusage.CgroupResourceUsage
wantExitCode int32
wantMessage string
wantStatus error
}{
{
name: "memory limit OOM kill with zero runner exit code",
cgroupUsage: &resourceusage.CgroupResourceUsage{
MemoryEventsOom: 1,
MemoryEventsOomKill: 1,
},
wantExitCode: 1,
wantMessage: "Action failed due to out of memory: cgroup memory limit was reached and a process was OOM-killed",
},
{
name: "external OOM kill with zero runner exit code",
cgroupUsage: &resourceusage.CgroupResourceUsage{
MemoryEventsOomKill: 1,
},
wantStatus: status.Error(codes.Unavailable, "An action process was OOM-killed without the action reaching its cgroup memory limit"),
},
} {
t.Run(testCase.name, func(t *testing.T) {
ctrl, ctx := gomock.WithContext(context.Background(), t)

resourceUsage, err := anypb.New(testCase.cgroupUsage)
require.NoError(t, err)

contentAddressableStorage := mock.NewMockBlobAccess(ctrl)
contentAddressableStorage.EXPECT().Get(
gomock.Any(),
digest.MustNewDigest("ubuntu1804", remoteexecution.DigestFunction_SHA256, "0000000000000000000000000000000000000000000000000000000000000002", 234),
).Return(buffer.NewProtoBufferFromProto(&remoteexecution.Command{
Arguments: []string{"clang"},
}, buffer.UserProvided))

buildDirectory := mock.NewMockBuildDirectory(ctrl)
buildDirectoryCreator := mock.NewMockBuildDirectoryCreator(ctrl)
actionDigest := digest.MustNewDigest("ubuntu1804", remoteexecution.DigestFunction_SHA256, "0000000000000000000000000000000000000000000000000000000000000001", 123)
buildDirectoryCreator.EXPECT().GetBuildDirectory(ctx, &actionDigest).
Return(buildDirectory, nil, nil)
filePool := mock.NewMockFilePool(ctrl)
monitor := mock.NewMockUnreadDirectoryMonitor(ctrl)
buildDirectory.EXPECT().InstallHooks(filePool, gomock.Any())
buildDirectory.EXPECT().Mkdir(path.MustNewComponent("root"), os.FileMode(0o777))
inputRootDirectory := mock.NewMockBuildDirectory(ctrl)
buildDirectory.EXPECT().EnterBuildDirectory(path.MustNewComponent("root")).Return(inputRootDirectory, nil)
inputRootDirectory.EXPECT().MergeDirectoryContents(
ctx,
gomock.Any(),
digest.MustNewDigest("ubuntu1804", remoteexecution.DigestFunction_SHA256, "0000000000000000000000000000000000000000000000000000000000000003", 345),
monitor,
).Return(nil)
buildDirectory.EXPECT().Mkdir(path.MustNewComponent("tmp"), os.FileMode(0o777))
buildDirectory.EXPECT().Mkdir(path.MustNewComponent("server_logs"), os.FileMode(0o777))

runner := mock.NewMockRunnerClient(ctrl)
runner.EXPECT().Run(gomock.Any(), &runner_pb.RunRequest{
Arguments: []string{"clang"},
EnvironmentVariables: map[string]string{},
WorkingDirectory: "",
StdoutPath: "stdout",
StderrPath: "stderr",
InputRootDirectory: "root",
TemporaryDirectory: "tmp",
ServerLogsDirectory: "server_logs",
}).Return(&runner_pb.RunResponse{
ExitCode: 0,
ResourceUsage: []*anypb.Any{resourceUsage},
}, nil)
inputRootDirectory.EXPECT().Close()
emptyDigest := digest.MustNewDigest("ubuntu1804", remoteexecution.DigestFunction_SHA256, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", 0)
buildDirectory.EXPECT().UploadFile(ctx, path.MustNewComponent("stdout"), gomock.Any(), gomock.Any()).Return(emptyDigest, nil)
buildDirectory.EXPECT().UploadFile(ctx, path.MustNewComponent("stderr"), gomock.Any(), gomock.Any()).Return(emptyDigest, nil)
serverLogsDirectory := mock.NewMockUploadableDirectory(ctrl)
buildDirectory.EXPECT().EnterUploadableDirectory(path.MustNewComponent("server_logs")).Return(serverLogsDirectory, nil)
serverLogsDirectory.EXPECT().ReadDir()
serverLogsDirectory.EXPECT().Close()
buildDirectory.EXPECT().Close()

clock := mock.NewMockClock(ctrl)
clock.EXPECT().NewContextWithTimeout(gomock.Any(), time.Hour).DoAndReturn(func(parent context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
return context.WithCancel(parent)
})
clock.EXPECT().NewContextWithTimeout(gomock.Any(), 10*time.Second).DoAndReturn(func(parent context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
return parent, func() {}
})
localBuildExecutor := builder.NewLocalBuildExecutor(
contentAddressableStorage,
buildDirectoryCreator,
runner,
clock,
/* maximumWritableFileUploadDelay = */ 10*time.Second,
/* inputRootCharacterDevices = */ nil,
/* maximumMessageSizeBytes = */ 10000,
/* environmentVariables = */ map[string]string{},
/* forceUploadTreesAndDirectories = */ false,
)

metadata := make(chan *remoteworker.CurrentState_Executing, 10)
executeResponse := localBuildExecutor.Execute(
ctx,
filePool,
monitor,
digest.MustNewFunction("ubuntu1804", remoteexecution.DigestFunction_SHA256),
&remoteworker.DesiredState_Executing{
ActionDigest: &remoteexecution.Digest{
Hash: "0000000000000000000000000000000000000000000000000000000000000001",
SizeBytes: 123,
},
Action: &remoteexecution.Action{
CommandDigest: &remoteexecution.Digest{
Hash: "0000000000000000000000000000000000000000000000000000000000000002",
SizeBytes: 234,
},
InputRootDigest: &remoteexecution.Digest{
Hash: "0000000000000000000000000000000000000000000000000000000000000003",
SizeBytes: 345,
},
Timeout: &durationpb.Duration{Seconds: 3600},
},
},
metadata,
)

expectedResponse := &remoteexecution.ExecuteResponse{
Result: &remoteexecution.ActionResult{
ExitCode: testCase.wantExitCode,
ExecutionMetadata: &remoteexecution.ExecutedActionMetadata{
AuxiliaryMetadata: []*anypb.Any{resourceUsage},
},
},
Message: testCase.wantMessage,
}
if testCase.wantStatus != nil {
expectedResponse.Status = status.Convert(testCase.wantStatus).Proto()
}
testutil.RequireEqualProto(t, expectedResponse, executeResponse)
})
}
}

func TestLocalBuildExecutorCachingInvalidTimeout(t *testing.T) {
ctrl, ctx := gomock.WithContext(context.Background(), t)

Expand Down
13 changes: 11 additions & 2 deletions pkg/proto/configuration/bb_runner/bb_runner.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions pkg/proto/configuration/bb_runner/bb_runner.proto
Original file line number Diff line number Diff line change
Expand Up @@ -135,4 +135,13 @@ message ApplicationConfiguration {
// https://github.com/bazelbuild/bazel/blob/master/src/main/java/com/google/devtools/build/lib/exec/local/XcodeLocalEnvProvider.java
// https://www.smileykeith.com/2021/03/08/locking-xcode-in-bazel/
map<string, string> apple_xcode_developer_directories = 14;

// Sample cgroup v2 resource usage counters around each action.
//
// This should only be enabled when the bb_runner process has an exclusive
// cgroup for action execution. If multiple runners or actions share the
// cgroup, the sampled counter deltas include unrelated work and are
// misleading. When this is enabled, bb_runner fails actions with
// codes.Internal if it detects multiple actions running concurrently.
bool assume_exclusive_cgroup = 15;
}
Loading
Loading