Skip to content

Commit 73d8121

Browse files
committed
Add E2E test for reconcile on NRI restart
1 parent c3b1774 commit 73d8121

1 file changed

Lines changed: 263 additions & 0 deletions

File tree

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
/*
Copyright 2026 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
17+
package e2e
18+
19+
import (
20+
"context"
21+
"encoding/json"
22+
"fmt"
23+
"os"
24+
"time"
25+
26+
"github.com/kubernetes-sigs/dra-driver-cpu/test/pkg/discovery"
27+
"github.com/kubernetes-sigs/dra-driver-cpu/test/pkg/fixture"
28+
"github.com/kubernetes-sigs/dra-driver-cpu/test/pkg/node"
29+
e2epod "github.com/kubernetes-sigs/dra-driver-cpu/test/pkg/pod"
30+
"github.com/onsi/ginkgo/v2"
31+
"github.com/onsi/gomega"
32+
appsv1 "k8s.io/api/apps/v1"
33+
v1 "k8s.io/api/core/v1"
34+
resourcev1 "k8s.io/api/resource/v1"
35+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
36+
"k8s.io/utils/cpuset"
37+
)
38+
39+
const (
	// daemonSetNamespaceRule is the namespace hosting the dracpu driver DaemonSet.
	daemonSetNamespaceRule = "kube-system"
	// daemonSetLabelRule is the label selector identifying the dracpu driver pods.
	daemonSetLabelRule = "app=dracpu"
	// pollIntervalRule and pollTimeoutRule bound the Eventually polling loops
	// used by the reconciliation specs below.
	pollIntervalRule = 2 * time.Second
	pollTimeoutRule  = 2 * time.Minute
)
45+
46+
var _ = ginkgo.Describe("NRI Reconciliation on Restart", ginkgo.Serial, ginkgo.Ordered, ginkgo.ContinueOnFailure, func() {
47+
var (
48+
rootFxt *fixture.Fixture
49+
targetNode *v1.Node
50+
targetNodeCPUInfo discovery.DRACPUInfo
51+
dracpuTesterImage string
52+
allocatableCPUs cpuset.CPUSet
53+
reservedCPUs cpuset.CPUSet
54+
cpuDeviceMode string
55+
orgDaemonSet *appsv1.DaemonSet
56+
)
57+
58+
ginkgo.BeforeAll(func(ctx context.Context) {
59+
// early cheap check before to create the Fixture, so we use GinkgoLogr directly
60+
dracpuTesterImage = os.Getenv("DRACPU_E2E_TEST_IMAGE")
61+
gomega.Expect(dracpuTesterImage).ToNot(gomega.BeEmpty(), "missing environment variable DRACPU_E2E_TEST_IMAGE")
62+
ginkgo.GinkgoLogr.Info("discovery image", "pullSpec", dracpuTesterImage)
63+
64+
var err error
65+
if reservedCPUVal := os.Getenv("DRACPU_E2E_RESERVED_CPUS"); len(reservedCPUVal) > 0 {
66+
reservedCPUs, err = cpuset.Parse(reservedCPUVal)
67+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
68+
ginkgo.GinkgoLogr.Info("reserved CPUs", "value", reservedCPUs.String())
69+
}
70+
71+
rootFxt, err = fixture.ForGinkgo()
72+
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot create fixture")
73+
infraFxt := rootFxt.WithPrefix("infra")
74+
gomega.Expect(infraFxt.Setup(ctx)).To(gomega.Succeed())
75+
ginkgo.DeferCleanup(infraFxt.Teardown)
76+
77+
ginkgo.By("getting the daemonset configuration")
78+
orgDaemonSet, err = rootFxt.K8SClientset.AppsV1().DaemonSets(daemonSetNamespaceRule).Get(ctx, "dracpu", metav1.GetOptions{})
79+
gomega.Expect(err).ToNot(gomega.HaveOccurred(), "cannot get dracpu daemonset")
80+
gomega.Expect(orgDaemonSet.Spec.Template.Spec.Containers).ToNot(gomega.BeEmpty(), "no containers in dracpu daemonset")
81+
82+
for _, arg := range orgDaemonSet.Spec.Template.Spec.Containers[0].Args {
83+
if val, ok := parseCPUDeviceModeArg(arg); ok {
84+
cpuDeviceMode = val
85+
}
86+
}
87+
88+
// Find target node
89+
if targetNodeName := os.Getenv("DRACPU_E2E_TARGET_NODE"); len(targetNodeName) > 0 {
90+
targetNode, err = rootFxt.K8SClientset.CoreV1().Nodes().Get(ctx, targetNodeName, metav1.GetOptions{})
91+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
92+
} else {
93+
gomega.Eventually(func() error {
94+
workerNodes, err := node.FindWorkers(ctx, infraFxt.K8SClientset)
95+
if err != nil {
96+
return err
97+
}
98+
if len(workerNodes) == 0 {
99+
return fmt.Errorf("no worker nodes detected")
100+
}
101+
targetNode = workerNodes[0]
102+
return nil
103+
}).WithTimeout(1 * time.Minute).WithPolling(5 * time.Second).Should(gomega.Succeed())
104+
}
105+
106+
// Discover topology
107+
infoPod := discovery.MakePod(infraFxt.Namespace.Name, dracpuTesterImage)
108+
infoPod = e2epod.PinToNode(infoPod, targetNode.Name)
109+
infoPod, err = e2epod.RunToCompletion(ctx, infraFxt.K8SClientset, infoPod)
110+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
111+
data, err := e2epod.GetLogs(infraFxt.K8SClientset, ctx, infoPod.Namespace, infoPod.Name, infoPod.Spec.Containers[0].Name)
112+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
113+
gomega.Expect(json.Unmarshal([]byte(data), &targetNodeCPUInfo)).To(gomega.Succeed())
114+
allocatableCPUs = makeCPUSetFromDiscoveredCPUInfo(targetNodeCPUInfo)
115+
})
116+
117+
ginkgo.It("should recover shared pool mask and preserve exclusive mask after restart", func(ctx context.Context) {
118+
fxt := rootFxt.WithPrefix("reconciliation")
119+
gomega.Expect(fxt.Setup(ctx)).To(gomega.Succeed())
120+
ginkgo.DeferCleanup(fxt.Teardown)
121+
122+
ginkgo.By("Creating Pod 1 with exclusive CPUs")
123+
claimTemplate := resourcev1.ResourceClaimTemplate{
124+
ObjectMeta: metav1.ObjectMeta{
125+
Name: "cpu-claim-reconcile-exclusive",
126+
},
127+
Spec: resourcev1.ResourceClaimTemplateSpec{
128+
Spec: makeResourceClaimSpec(2, cpuDeviceMode == "grouped"),
129+
},
130+
}
131+
createdClaimTemplate, err := fxt.K8SClientset.ResourceV1().ResourceClaimTemplates(fxt.Namespace.Name).Create(ctx, &claimTemplate, metav1.CreateOptions{})
132+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
133+
134+
pod1 := makeTesterPodWithExclusiveCPUClaim(fxt.Namespace.Name, dracpuTesterImage, createdClaimTemplate.Name, 2, targetNode.Name)
135+
createdPod1, err := e2epod.CreateSync(ctx, fxt.K8SClientset, pod1)
136+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
137+
138+
ginkgo.By("Verifying Pod 1 has correct exclusive CPU mask")
139+
alloc1 := getTesterPodCPUAllocation(fxt.K8SClientset, ctx, createdPod1)
140+
fxt.Log.Info("Pod 1 CPU allocation", "cpuAssigned", alloc1.CPUAssigned.String())
141+
gomega.Expect(alloc1.CPUAssigned.Size()).To(gomega.Equal(2), "Pod 1 did not get exclusive CPUs")
142+
exclusiveCPUs := alloc1.CPUAssigned
143+
144+
ginkgo.By("Stopping cpu dra driver pod on target node")
145+
// Defer restoration of DaemonSet
146+
ginkgo.DeferCleanup(func(ctx context.Context) {
147+
ginkgo.By("Restoring NRI plugin DaemonSet")
148+
ds, err := rootFxt.K8SClientset.AppsV1().DaemonSets(daemonSetNamespaceRule).Get(ctx, "dracpu", metav1.GetOptions{})
149+
if err != nil {
150+
return
151+
}
152+
ds.Spec = orgDaemonSet.Spec
153+
_, err = rootFxt.K8SClientset.AppsV1().DaemonSets(daemonSetNamespaceRule).Update(ctx, ds, metav1.UpdateOptions{})
154+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
155+
})
156+
157+
gomega.Eventually(func(g gomega.Gomega) {
158+
// Modify DaemonSet to exclude target node
159+
ds := orgDaemonSet.DeepCopy()
160+
req := v1.NodeSelectorRequirement{
161+
Key: "kubernetes.io/hostname",
162+
Operator: v1.NodeSelectorOpNotIn,
163+
Values: []string{targetNode.Name},
164+
}
165+
if ds.Spec.Template.Spec.Affinity == nil {
166+
ds.Spec.Template.Spec.Affinity = &v1.Affinity{}
167+
}
168+
if ds.Spec.Template.Spec.Affinity.NodeAffinity == nil {
169+
ds.Spec.Template.Spec.Affinity.NodeAffinity = &v1.NodeAffinity{}
170+
}
171+
if ds.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
172+
ds.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{}
173+
}
174+
terms := ds.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
175+
if len(terms) == 0 {
176+
ds.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = []v1.NodeSelectorTerm{
177+
{
178+
MatchExpressions: []v1.NodeSelectorRequirement{req},
179+
},
180+
}
181+
} else {
182+
for i := range terms {
183+
terms[i].MatchExpressions = append(terms[i].MatchExpressions, req)
184+
}
185+
}
186+
_, err = rootFxt.K8SClientset.AppsV1().DaemonSets(daemonSetNamespaceRule).Update(ctx, ds, metav1.UpdateOptions{})
187+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
188+
}, pollTimeoutRule, pollIntervalRule).Should(gomega.Succeed(), "failed to update DaemonSet affinity")
189+
190+
ginkgo.By("Waiting for cpu dra driver pod to terminate on target node")
191+
gomega.Eventually(func(g gomega.Gomega) {
192+
pods, err := listDriverPods(ctx, rootFxt.K8SClientset)
193+
g.Expect(err).NotTo(gomega.HaveOccurred())
194+
195+
terminated := true
196+
for _, p := range pods {
197+
if p.Spec.NodeName == targetNode.Name && p.Status.Phase != v1.PodFailed && p.Status.Phase != v1.PodSucceeded {
198+
terminated = false
199+
break
200+
}
201+
}
202+
g.Expect(terminated).To(gomega.BeTrue(), "Pod on target node is still running")
203+
}, pollTimeoutRule, pollIntervalRule).Should(gomega.Succeed(), "timed out waiting for pod to terminate")
204+
205+
ginkgo.By("Verifying Pod 1 still has correct exclusive CPU mask while NRI is down")
206+
alloc1Down := getTesterPodCPUAllocation(fxt.K8SClientset, ctx, createdPod1)
207+
gomega.Expect(alloc1Down.CPUAssigned).To(gomega.Equal(exclusiveCPUs), "Pod 1 CPU mask changed after NRI stopped")
208+
209+
ginkgo.By("Creating Pod 2 (Best Effort) on target node")
210+
pod2 := makeTesterPodBestEffort(fxt.Namespace.Name, dracpuTesterImage)
211+
pod2 = e2epod.PinToNode(pod2, targetNode.Name)
212+
createdPod2, err := e2epod.CreateSync(ctx, fxt.K8SClientset, pod2)
213+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
214+
215+
ginkgo.By("Verifying Pod 2 CPU mask is NOT restricted to shared pool (should be default/all)")
216+
alloc2 := getTesterPodCPUAllocation(fxt.K8SClientset, ctx, createdPod2)
217+
fxt.Log.Info("Pod 2 CPU allocation (without NRI)", "cpuAssigned", alloc2.CPUAssigned.String())
218+
// Since NRI is down, pod2 is not restricted to shared pool CPUs and gets all online CPUs.
219+
gomega.Expect(alloc2.CPUAffinity).To(gomega.Equal(allocatableCPUs), "Pod 2 CPU mask not equal to all CPUs")
220+
221+
ginkgo.By("Bringing up the cpu dra driver on target node")
222+
// Restore original DaemonSet
223+
ds, err := rootFxt.K8SClientset.AppsV1().DaemonSets(daemonSetNamespaceRule).Get(ctx, "dracpu", metav1.GetOptions{})
224+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
225+
ds.Spec = orgDaemonSet.Spec
226+
_, err = rootFxt.K8SClientset.AppsV1().DaemonSets(daemonSetNamespaceRule).Update(ctx, ds, metav1.UpdateOptions{})
227+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
228+
229+
ginkgo.By("Waiting for NRI plugin pod to become ready on target node")
230+
gomega.Eventually(func(g gomega.Gomega) {
231+
pods, err := listDriverPods(ctx, rootFxt.K8SClientset)
232+
g.Expect(err).NotTo(gomega.HaveOccurred())
233+
234+
ready := false
235+
for _, p := range pods {
236+
if p.Spec.NodeName == targetNode.Name {
237+
for _, cs := range p.Status.ContainerStatuses {
238+
if cs.Ready {
239+
ready = true
240+
break
241+
}
242+
}
243+
if ready {
244+
break
245+
}
246+
}
247+
}
248+
g.Expect(ready).To(gomega.BeTrue(), "Pod on target node is not ready")
249+
}, pollTimeoutRule, pollIntervalRule).Should(gomega.Succeed(), "timed out waiting for pod to become ready")
250+
251+
ginkgo.By("Verifying Pod 1 still has correct exclusive CPU mask after NRI restart")
252+
alloc1Up := getTesterPodCPUAllocation(fxt.K8SClientset, ctx, createdPod1)
253+
gomega.Expect(alloc1Up.CPUAssigned).To(gomega.Equal(exclusiveCPUs), "Pod 1 CPU mask changed after NRI restarted")
254+
255+
ginkgo.By("Verifying Pod 2 CPU mask IS restricted to shared pool (excludes Pod 1 CPUs)")
256+
gomega.Eventually(func(g gomega.Gomega) {
257+
alloc2Updated := getTesterPodCPUAllocation(fxt.K8SClientset, ctx, createdPod2)
258+
fxt.Log.Info("Pod 2 CPU allocation (after NRI recovery)", "cpuAssigned", alloc2Updated.CPUAssigned.String())
259+
// pod2 must NOT contain the exclusive CPUs now
260+
g.Expect(alloc2Updated.CPUAssigned.Intersection(exclusiveCPUs).IsEmpty()).To(gomega.BeTrue(), "Pod 2 still has access to exclusive CPUs")
261+
}, pollTimeoutRule, pollIntervalRule).Should(gomega.Succeed(), "timed out waiting for Pod 2 CPU mask update")
262+
})
263+
})

0 commit comments

Comments
 (0)