@@ -8,6 +8,7 @@ package omni
import (
"context"
"errors"
"log"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/controller/generic/qtransform"
@@ -44,6 +45,7 @@ func NewClusterMachineStatusController() *ClusterMachineStatusController {
return omni.NewClusterMachine(clusterMachineStatus.Metadata().ID())
},
TransformFunc: func(ctx context.Context, r controller.Reader, _ *zap.Logger, clusterMachine *omni.ClusterMachine, clusterMachineStatus *omni.ClusterMachineStatus) error {
log.Printf("[CMS-DEBUG] TransformFunc called for ClusterMachine %s", clusterMachine.Metadata().ID())
machine, err := safe.ReaderGet[*omni.Machine](ctx, r, resource.NewMetadata(resources.DefaultNamespace, omni.MachineType, clusterMachine.Metadata().ID(), resource.VersionUndefined))
if err != nil && !state.IsNotFoundError(err) {
return err
@@ -322,10 +324,13 @@ func updateMachineProvisionStatus(ctx context.Context, r controller.Reader, mach
}

if machineRequestStatus == nil {
log.Printf("[CMS-DEBUG] updateMachineProvisionStatus: MachineRequestStatus %q not found, providerID will be empty", machineRequestID)

return nil
}

cmsVal.ProvisionStatus.ProviderId, ok = machineRequestStatus.Metadata().Labels().Get(omni.LabelInfraProviderID)
log.Printf("[CMS-DEBUG] updateMachineProvisionStatus: MachineRequestStatus %q found, providerID=%q", machineRequestID, cmsVal.ProvisionStatus.ProviderId)

if !ok {
return nil
@@ -7,6 +7,9 @@ package omni_test

import (
"context"
"fmt"
"log"
"runtime"
"testing"
"time"

@@ -15,11 +18,14 @@ import (
"github.com/siderolabs/go-retry/retry"
machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"

"github.com/siderolabs/omni/client/api/omni/specs"
"github.com/siderolabs/omni/client/pkg/omni/resources/infra"
"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
omnictrl "github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/omni"
"github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/testutils"
"github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/testutils/rmock"
"github.com/siderolabs/omni/internal/backend/runtime/omni/controllers/testutils/rmock/options"
)
@@ -246,3 +252,93 @@ func TestClusterMachineStatusSuite(t *testing.T) {

suite.Run(t, new(ClusterMachineStatusSuite))
}

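// A minimal sketch of the lossy mapper pattern exercised by the test below.
// Names and signatures here are approximations for illustration only; the
// real mapper is produced by qtransform.MapperFuncFromTyped and may differ:
//
//	func mapper(ctx context.Context, r controller.Reader, ptr resource.Pointer) ([]resource.Pointer, error) {
//		mrs, err := safe.ReaderGet[*infra.MachineRequestStatus](ctx, r, ptr)
//		if err != nil {
//			if state.IsNotFoundError(err) {
//				// Input already destroyed: no IDs returned, nothing queued,
//				// so the stale output is never rebuilt.
//				return nil, nil
//			}
//
//			return nil, err
//		}
//
//		return []resource.Pointer{omni.NewClusterMachine(mrs.TypedSpec().Value.Id).Metadata()}, nil
//	}
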
// TestMachineRequestStatusDestroyMapping reproduces the mapped input destruction race:
// when MachineRequestStatus is destroyed, MapperFuncFromTyped does a Get that returns not-found,
// so the mapper returns nil and the ClusterMachine is never re-reconciled.
//
// Run with: go test -v -run TestMachineRequestStatusDestroyMapping -count=1000 -failfast ./internal/backend/runtime/omni/controllers/omni
func TestMachineRequestStatusDestroyMapping(t *testing.T) {
t.Parallel()

ctx, cancel := context.WithTimeout(t.Context(), 30*time.Second)
defer cancel()

testutils.WithRuntime(ctx, t, testutils.TestOptions{},
func(_ context.Context, tc testutils.TestContext) {
require.NoError(t, tc.Runtime.RegisterQController(omnictrl.NewClusterMachineStatusController()))
},
func(ctx context.Context, tc testutils.TestContext) {
const machineID = "machine-uuid-1"
const requestID = "request-1"
const providerID = "test-provider"

st := tc.State

cluster := rmock.Mock[*omni.Cluster](ctx, t, st)

rmock.Mock[*omni.MachineSetNode](ctx, t, st,
options.WithID(machineID),
options.LabelCluster(cluster),
options.EmptyLabel(omni.LabelWorkerRole),
)

rmock.Mock[*omni.ClusterMachine](ctx, t, st, options.WithID(machineID))

machine := omni.NewMachine(machineID)
machine.TypedSpec().Value.Connected = true
require.NoError(t, st.Create(ctx, machine))

machineStatus := omni.NewMachineStatus(machineID)
machineStatus.Metadata().Labels().Set(omni.LabelMachineRequest, requestID)
require.NoError(t, st.Create(ctx, machineStatus))

statusSnapshot := omni.NewMachineStatusSnapshot(machineID)
statusSnapshot.TypedSpec().Value.MachineStatus = &machineapi.MachineStatusEvent{
Stage: machineapi.MachineStatusEvent_RUNNING,
Status: &machineapi.MachineStatusEvent_MachineStatus{Ready: true},
}
require.NoError(t, st.Create(ctx, statusSnapshot))

mrs := infra.NewMachineRequestStatus(requestID)
mrs.TypedSpec().Value.Id = machineID
mrs.Metadata().Labels().Set(omni.LabelInfraProviderID, providerID)
require.NoError(t, st.Create(ctx, mrs))

// Wait for providerID to be populated.
rtestutils.AssertResource(ctx, t, st, machineID,
func(status *omni.ClusterMachineStatus, assertions *assert.Assertions) {
assertions.Equal(providerID, status.TypedSpec().Value.GetProvisionStatus().GetProviderId())
},
)

// Flood the queue with mapped input events to build a backlog.
for i := range 100 {
ss := omni.NewMachineStatusSnapshot(fmt.Sprintf("pressure-%d", i))
ss.TypedSpec().Value.MachineStatus = &machineapi.MachineStatusEvent{}
require.NoError(t, st.Create(ctx, ss))
}

// Yield so the controller goroutine gets a chance to start draining the backlog.
runtime.Gosched()

// Teardown+Destroy while the queue is still backlogged.
_, err := st.Teardown(ctx, mrs.Metadata())
require.NoError(t, err)
require.NoError(t, st.Destroy(ctx, mrs.Metadata()))

// If the mapper lost the event, providerID stays stale and this times out.
checkCtx, checkCancel := context.WithTimeout(ctx, 5*time.Second)
defer checkCancel()

rtestutils.AssertResource(checkCtx, t, st, machineID,
func(status *omni.ClusterMachineStatus, assertions *assert.Assertions) {
got := status.TypedSpec().Value.GetProvisionStatus().GetProviderId()
log.Printf("[TEST] providerID after destroy: %q", got)

assertions.Empty(got,
"BUG: providerID stale (%q) - mapper failed to trigger reconciliation", got)
},
)
},
)
}