-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Add a monitortest for cluster region/zone/instance type autodl data #31305
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,248 @@ | ||
| package clusterinstancetypes | ||
|
|
||
| import ( | ||
| "context" | ||
| "encoding/json" | ||
| "fmt" | ||
| "path/filepath" | ||
| "sort" | ||
| "strings" | ||
| "time" | ||
|
|
||
| configv1 "github.com/openshift/api/config/v1" | ||
| machinev1beta1 "github.com/openshift/api/machine/v1beta1" | ||
| configclient "github.com/openshift/client-go/config/clientset/versioned" | ||
| machineclient "github.com/openshift/client-go/machine/clientset/versioned" | ||
| "github.com/openshift/origin/pkg/dataloader" | ||
| "github.com/openshift/origin/pkg/monitor/monitorapi" | ||
| "github.com/openshift/origin/pkg/monitortestframework" | ||
| "github.com/openshift/origin/pkg/test/ginkgo/junitapi" | ||
| "github.com/sirupsen/logrus" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/client-go/kubernetes" | ||
| "k8s.io/client-go/rest" | ||
| ) | ||
|
|
||
| type clusterInstanceTypes struct { | ||
| adminRESTConfig *rest.Config | ||
| data []instanceTypeRow | ||
| } | ||
|
|
||
| type instanceTypeRow struct { | ||
| Platform string `json:"platform"` | ||
| Region string `json:"region"` | ||
| Role string `json:"role"` | ||
| InstanceType string `json:"instance_type"` | ||
| } | ||
|
|
||
| func NewClusterInstanceTypes() monitortestframework.MonitorTest { | ||
| return &clusterInstanceTypes{} | ||
| } | ||
|
|
||
| func (w *clusterInstanceTypes) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (w *clusterInstanceTypes) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error { | ||
| w.adminRESTConfig = adminRESTConfig | ||
| return nil | ||
| } | ||
|
|
||
| func (w *clusterInstanceTypes) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) { | ||
| logger := logrus.WithField("MonitorTest", "ClusterInstanceTypes") | ||
|
|
||
| data, err := w.collect(ctx) | ||
| if err != nil { | ||
| logger.WithError(err).Warn("failed to collect instance type data") | ||
| return nil, nil, nil | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| w.data = data | ||
| return nil, nil, nil | ||
| } | ||
|
|
||
| func (*clusterInstanceTypes) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) { | ||
| return nil, nil | ||
| } | ||
|
|
||
| func (*clusterInstanceTypes) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) { | ||
| return nil, nil | ||
| } | ||
|
|
||
| func (w *clusterInstanceTypes) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error { | ||
| if len(w.data) == 0 { | ||
| return nil | ||
| } | ||
|
|
||
| rows := make([]map[string]string, 0, len(w.data)) | ||
| for _, r := range w.data { | ||
| rows = append(rows, map[string]string{ | ||
| "Platform": r.Platform, | ||
| "Region": r.Region, | ||
| "Role": r.Role, | ||
| "InstanceType": r.InstanceType, | ||
| }) | ||
| } | ||
|
|
||
| dataFile := dataloader.DataFile{ | ||
| TableName: "cluster_instance_types", | ||
| Schema: map[string]dataloader.DataType{ | ||
| "Platform": dataloader.DataTypeString, | ||
| "Region": dataloader.DataTypeString, | ||
| "Role": dataloader.DataTypeString, | ||
| "InstanceType": dataloader.DataTypeString, | ||
| }, | ||
| Rows: rows, | ||
| } | ||
|
|
||
| fileName := filepath.Join(storageDir, fmt.Sprintf("cluster-instance-types%s-%s", timeSuffix, dataloader.AutoDataLoaderSuffix)) | ||
| if err := dataloader.WriteDataFile(fileName, dataFile); err != nil { | ||
| return fmt.Errorf("failed to write instance types autodl: %w", err) | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func (*clusterInstanceTypes) Cleanup(ctx context.Context) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (w *clusterInstanceTypes) collect(ctx context.Context) ([]instanceTypeRow, error) { | ||
| configClient, err := configclient.NewForConfig(w.adminRESTConfig) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to create config client: %w", err) | ||
| } | ||
|
|
||
| infra, err := configClient.ConfigV1().Infrastructures().Get(ctx, "cluster", metav1.GetOptions{}) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to get infrastructure: %w", err) | ||
| } | ||
|
|
||
| if infra.Status.PlatformStatus == nil { | ||
| logrus.Info("skipping instance type collection: platform status not set") | ||
| return nil, nil | ||
| } | ||
|
|
||
| platform := strings.ToLower(string(infra.Status.PlatformStatus.Type)) | ||
| if platform != "aws" && platform != "azure" && platform != "gcp" { | ||
| logrus.WithField("platform", platform).Info("skipping instance type collection for unsupported platform") | ||
| return nil, nil | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| // Azure doesn't expose region in Infrastructure CR, so we always fall back to node labels | ||
| region := getRegionFromInfrastructure(infra) | ||
| if region == "" { | ||
| kubeClient, err := kubernetes.NewForConfig(w.adminRESTConfig) | ||
| if err != nil { | ||
| logrus.WithError(err).Warn("failed to create kube client for region fallback") | ||
| } else { | ||
| nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) | ||
| if err != nil { | ||
| logrus.WithError(err).Warn("failed to list nodes for region fallback") | ||
| } else if len(nodes.Items) > 0 { | ||
| region = nodes.Items[0].Labels["topology.kubernetes.io/region"] | ||
| } | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
|
|
||
| machineClientSet, err := machineclient.NewForConfig(w.adminRESTConfig) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to create machine client: %w", err) | ||
| } | ||
|
|
||
| machines, err := machineClientSet.MachineV1beta1().Machines("openshift-machine-api").List(ctx, metav1.ListOptions{}) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to list machines: %w", err) | ||
| } | ||
|
Comment on lines
+152
to
+155
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can get all the machine details from node labels. For example, we can extract the arch, instance type and region for all 3 aws, azure and gcp platforms👇: kubernetes.io/arch: <arch>
node.kubernetes.io/instance-type: <type>
topology.kubernetes.io/region: <region>For control plane nodes, they will have these labels: node-role.kubernetes.io/control-plane: ""
node-role.kubernetes.io/master: ""For worker nodes, they will have: node-role.kubernetes.io/worker: ""
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My thought is that we likely need to switch to check ClusterAPI machines in a near future. So if we can inspect nodes, it'd ease our maintenance. WDYT? |
||
|
|
||
| return buildRows(platform, region, machines.Items), nil | ||
| } | ||
|
|
||
| func buildRows(platform, region string, machines []machinev1beta1.Machine) []instanceTypeRow { | ||
| seen := map[string]bool{} | ||
| var result []instanceTypeRow | ||
|
|
||
| for i := range machines { | ||
| machine := &machines[i] | ||
| role := "worker" | ||
| if isMaster(machine) { | ||
| role = "control-plane" | ||
| } | ||
| instanceType := extractInstanceType(platform, machine) | ||
| if instanceType == "" { | ||
| continue | ||
| } | ||
| key := role + "/" + instanceType | ||
| if seen[key] { | ||
| continue | ||
| } | ||
| seen[key] = true | ||
| result = append(result, instanceTypeRow{ | ||
| Platform: platform, | ||
| Region: region, | ||
| Role: role, | ||
| InstanceType: instanceType, | ||
| }) | ||
| } | ||
|
|
||
| sort.Slice(result, func(i, j int) bool { | ||
| if result[i].Role != result[j].Role { | ||
| return result[i].Role < result[j].Role | ||
| } | ||
| return result[i].InstanceType < result[j].InstanceType | ||
| }) | ||
|
|
||
| return result | ||
| } | ||
|
|
||
| func getRegionFromInfrastructure(infra *configv1.Infrastructure) string { | ||
| if infra.Status.PlatformStatus == nil { | ||
| return "" | ||
| } | ||
| switch infra.Status.PlatformStatus.Type { | ||
| case configv1.AWSPlatformType: | ||
| if infra.Status.PlatformStatus.AWS != nil { | ||
| return infra.Status.PlatformStatus.AWS.Region | ||
| } | ||
| case configv1.GCPPlatformType: | ||
| if infra.Status.PlatformStatus.GCP != nil { | ||
| return infra.Status.PlatformStatus.GCP.Region | ||
| } | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| return "" | ||
| } | ||
|
|
||
| func isMaster(machine *machinev1beta1.Machine) bool { | ||
| return machine.Labels["machine.openshift.io/cluster-api-machine-role"] == "master" | ||
| } | ||
|
|
||
| func extractInstanceType(platform string, machine *machinev1beta1.Machine) string { | ||
| if machine.Spec.ProviderSpec.Value == nil { | ||
| return "" | ||
| } | ||
| raw := machine.Spec.ProviderSpec.Value.Raw | ||
|
|
||
| switch platform { | ||
| case "aws": | ||
| var spec machinev1beta1.AWSMachineProviderConfig | ||
| if err := json.Unmarshal(raw, &spec); err != nil { | ||
| logrus.WithError(err).WithField("machine", machine.Name).Warn("failed to unmarshal AWS provider spec") | ||
| return "" | ||
| } | ||
| return spec.InstanceType | ||
| case "azure": | ||
| var spec machinev1beta1.AzureMachineProviderSpec | ||
| if err := json.Unmarshal(raw, &spec); err != nil { | ||
| logrus.WithError(err).WithField("machine", machine.Name).Warn("failed to unmarshal Azure provider spec") | ||
| return "" | ||
| } | ||
| return spec.VMSize | ||
| case "gcp": | ||
| var spec machinev1beta1.GCPMachineProviderSpec | ||
| if err := json.Unmarshal(raw, &spec); err != nil { | ||
| logrus.WithError(err).WithField("machine", machine.Name).Warn("failed to unmarshal GCP provider spec") | ||
| return "" | ||
| } | ||
| return spec.MachineType | ||
| } | ||
| return "" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,129 @@ | ||
| package clusterinstancetypes | ||
|
|
||
| import ( | ||
| "encoding/json" | ||
| "testing" | ||
|
|
||
| machinev1beta1 "github.com/openshift/api/machine/v1beta1" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/runtime" | ||
| ) | ||
|
|
||
| func awsMachine(t *testing.T, name, role, instanceType string) machinev1beta1.Machine { | ||
| t.Helper() | ||
| providerSpec := machinev1beta1.AWSMachineProviderConfig{ | ||
| InstanceType: instanceType, | ||
| } | ||
| raw, err := json.Marshal(providerSpec) | ||
| if err != nil { | ||
| t.Fatalf("failed to marshal provider spec: %v", err) | ||
| } | ||
| return machinev1beta1.Machine{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: name, | ||
| Labels: map[string]string{"machine.openshift.io/cluster-api-machine-role": role}, | ||
| }, | ||
| Spec: machinev1beta1.MachineSpec{ | ||
| ProviderSpec: machinev1beta1.ProviderSpec{ | ||
| Value: &runtime.RawExtension{Raw: raw}, | ||
| }, | ||
| }, | ||
| } | ||
| } | ||
|
|
||
| func TestBuildRowsDeduplicates(t *testing.T) { | ||
| machines := []machinev1beta1.Machine{ | ||
| awsMachine(t, "master-0", "master", "m6i.xlarge"), | ||
| awsMachine(t, "master-1", "master", "m6i.xlarge"), | ||
| awsMachine(t, "master-2", "master", "m6i.xlarge"), | ||
| awsMachine(t, "worker-0", "worker", "m6i.2xlarge"), | ||
| awsMachine(t, "worker-1", "worker", "m6i.2xlarge"), | ||
| awsMachine(t, "worker-2", "worker", "m6i.2xlarge"), | ||
| } | ||
|
|
||
| rows := buildRows("aws", "us-east-1", machines) | ||
|
|
||
| if len(rows) != 2 { | ||
| t.Fatalf("expected 2 deduplicated rows, got %d: %+v", len(rows), rows) | ||
| } | ||
| if rows[0].Role != "control-plane" || rows[0].InstanceType != "m6i.xlarge" { | ||
| t.Errorf("unexpected control-plane row: %+v", rows[0]) | ||
| } | ||
| if rows[1].Role != "worker" || rows[1].InstanceType != "m6i.2xlarge" { | ||
| t.Errorf("unexpected worker row: %+v", rows[1]) | ||
| } | ||
| } | ||
|
|
||
| func TestBuildRowsMixedWorkerTypes(t *testing.T) { | ||
| machines := []machinev1beta1.Machine{ | ||
| awsMachine(t, "master-0", "master", "m6i.xlarge"), | ||
| awsMachine(t, "worker-0", "worker", "m5.xlarge"), | ||
| awsMachine(t, "worker-1", "worker", "m6i.2xlarge"), | ||
| awsMachine(t, "worker-2", "worker", "m5.xlarge"), | ||
| } | ||
|
|
||
| rows := buildRows("aws", "us-east-1", machines) | ||
|
|
||
| if len(rows) != 3 { | ||
| t.Fatalf("expected 3 rows (1 cp + 2 distinct worker types), got %d: %+v", len(rows), rows) | ||
| } | ||
| if rows[0].Role != "control-plane" { | ||
| t.Errorf("first row should be control-plane, got %+v", rows[0]) | ||
| } | ||
| workerTypes := map[string]bool{} | ||
| for _, r := range rows[1:] { | ||
| if r.Role != "worker" { | ||
| t.Errorf("expected worker role, got %+v", r) | ||
| } | ||
| workerTypes[r.InstanceType] = true | ||
| } | ||
| if !workerTypes["m5.xlarge"] || !workerTypes["m6i.2xlarge"] { | ||
| t.Errorf("expected both worker types present, got %v", workerTypes) | ||
| } | ||
| } | ||
|
|
||
| func TestBuildRowsSortsControlPlaneFirst(t *testing.T) { | ||
| machines := []machinev1beta1.Machine{ | ||
| awsMachine(t, "worker-0", "worker", "m5.xlarge"), | ||
| awsMachine(t, "master-0", "master", "m6i.xlarge"), | ||
| } | ||
|
|
||
| rows := buildRows("aws", "us-east-1", machines) | ||
|
|
||
| if len(rows) != 2 { | ||
| t.Fatalf("expected 2 rows, got %d", len(rows)) | ||
| } | ||
| if rows[0].Role != "control-plane" { | ||
| t.Errorf("control-plane should sort first, got %+v", rows[0]) | ||
| } | ||
| } | ||
|
|
||
| func TestBuildRowsPropagatesPlatformAndRegion(t *testing.T) { | ||
| machines := []machinev1beta1.Machine{ | ||
| awsMachine(t, "master-0", "master", "m6i.xlarge"), | ||
| } | ||
|
|
||
| rows := buildRows("aws", "eu-west-1", machines) | ||
|
|
||
| if rows[0].Platform != "aws" || rows[0].Region != "eu-west-1" { | ||
| t.Errorf("expected platform=aws region=eu-west-1, got %+v", rows[0]) | ||
| } | ||
| } | ||
|
|
||
| func TestBuildRowsSkipsEmptyProviderSpec(t *testing.T) { | ||
| machines := []machinev1beta1.Machine{ | ||
| awsMachine(t, "master-0", "master", "m6i.xlarge"), | ||
| { | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "worker-no-spec", | ||
| Labels: map[string]string{"machine.openshift.io/cluster-api-machine-role": "worker"}, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| rows := buildRows("aws", "us-east-1", machines) | ||
|
|
||
| if len(rows) != 1 { | ||
| t.Fatalf("expected 1 row (worker with no spec skipped), got %d: %+v", len(rows), rows) | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On AWS, we also support local and wavelength zones via edge compute pool.
Let's also collect zone details (for AWS only?). We can get it by inspecting node label
topology.kubernetes.io/zone, for example, in this run: