From a06db5039d12abcba666e64f6888a8bc8f7c1209 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Fri, 26 Dec 2025 04:01:36 +0530 Subject: [PATCH 1/3] MON-4389: Add tests for `telemetry` collection profile Adds tests corresponding to the changes done in https://github.com/openshift/cluster-monitoring-operator/pull/2694 Signed-off-by: Pranshu Srivastava --- .../prometheus/collection_profiles.go | 181 +++++++++++++----- 1 file changed, 128 insertions(+), 53 deletions(-) diff --git a/test/extended/prometheus/collection_profiles.go b/test/extended/prometheus/collection_profiles.go index 9e385fa825d6..8839947405c2 100644 --- a/test/extended/prometheus/collection_profiles.go +++ b/test/extended/prometheus/collection_profiles.go @@ -28,6 +28,7 @@ const ( collectionProfileFull = "full" collectionProfileDefault = collectionProfileFull collectionProfileMinimal = "minimal" + collectionProfileTelemetry = "telemetry" collectionProfileNone = "" operatorName = "cluster-monitoring-operator" @@ -42,6 +43,7 @@ var ( collectionProfilesSupportedList = []string{ collectionProfileFull, collectionProfileMinimal, + collectionProfileTelemetry, } ) @@ -63,9 +65,6 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil tctx := context.Background() g.BeforeAll(func() { - if !exutil.IsTechPreviewNoUpgrade(tctx, oc.AdminConfigClient()) { - g.Skip("skipping, this feature is only supported on TechPreviewNoUpgrade clusters") - } var err error r.kclient, err = kubernetes.NewForConfig(oc.AdminConfig()) if err != nil { @@ -235,76 +234,78 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil }) g.It("should hide default metrics", func() { - appNameSelector := "app.kubernetes.io/name" - appName := "kube-state-metrics" + r.compareComponentRelabellingsToCountForProfile(tctx, "app.kubernetes.io/name", "kube-state-metrics", "kube_", profile) + }) + }) + + g.Context("in a homogeneous telemetry environment,", func() { + profile := collectionProfileTelemetry + + g.BeforeAll(func() { + if enabledErr, err := telemetryIsEnabled(tctx, r.kclient); err != nil { + g.Fail(fmt.Sprintf("failed to determine if telemetry is enabled: %v", err)) + } else if enabledErr != nil { + g.Skip("telemetry is not enabled on this cluster, skipping telemetry collection profile tests") + } - var kubeStateMetricsMonitor *prometheusoperatorv1.ServiceMonitor + err := r.makeCollectionProfileConfigurationFor(tctx, profile) + o.Expect(err).To(o.BeNil()) o.Eventually(func() error { - monitors, err := r.fetchMonitorsFor(tctx, [2]string{collectionProfileFeatureLabel, profile}, [2]string{appNameSelector, appName}) + enabled, err := r.isProfileEnabled(tctx, profile) if err != nil { return err } - if len(monitors.Items) == 0 { - return fmt.Errorf("no monitors found with collection profile: %q and %#v=%q", profile, appNameSelector, appName) - } - if len(monitors.Items) > 1 { - return fmt.Errorf("more than one monitor found with collection profile: %q and %#v=%q", profile, appNameSelector, appName) + if !enabled { + return fmt.Errorf("collection profile %q is not enabled", profile) } - kubeStateMetricsMonitor = monitors.Items[0] return nil }, pollTimeout, pollInterval).Should(o.BeNil()) + }) - var kubeStateMetricsMainMetrics []string - kubeStateMetricsMonitorSpec := kubeStateMetricsMonitor.Spec - kubeStateMetricsMonitorSpecEndpoints := kubeStateMetricsMonitorSpec.Endpoints - if len(kubeStateMetricsMonitorSpecEndpoints) != 0 { - kubeStateMetricsMonitorSpecEndpoints0Relabelings := kubeStateMetricsMonitorSpecEndpoints[0].MetricRelabelConfigs - if len(kubeStateMetricsMonitorSpecEndpoints0Relabelings) != 0 { - for _, relabeling := range kubeStateMetricsMonitorSpecEndpoints0Relabelings { - // NOTE: This should accommodate for future changes to the relabeling scope. - if relabeling.Action == "keep" && - len(relabeling.SourceLabels) == 1 && - relabeling.SourceLabels[0] == "__name__" { - regexpString := relabeling.Regex - kubeRegex := regexp.MustCompile(`(?U)(kube_.*)[|,)]`) - kubeMetrics := kubeRegex.FindAllString(regexpString, -1) - for _, metric := range kubeMetrics { - // Golang doesn't support negative lookaheads. - if strings.HasPrefix(metric, "kube_state_metrics") { - continue - } - kubeStateMetricsMainMetrics = append(kubeStateMetricsMainMetrics, metric) - } - } - } - } + g.It("should hide default metrics", func() { + r.compareComponentRelabellingsToCountForProfile(tctx, "app.kubernetes.io/name", "kube-state-metrics", "kube_", profile) + }) + + // this test case ensures that the (a) opted-in (in-cluster components or + // otherwise) or (b) full/none/default collection profile monitors + // collectively expose the same volume of telemetry metrics as they did + // before + g.It("should not drop any telemetry metric", func() { + telemetryConfigMap, err := r.kclient.CoreV1().ConfigMaps("openshift-monitoring").Get(tctx, "telemetry-config", metav1.GetOptions{}) + o.Expect(err).To(o.BeNil()) + + var telemetryConfig struct { + Matches []string `yaml:"matches"` + } + err = yaml.Unmarshal([]byte(telemetryConfigMap.Data["metrics.yaml"]), &telemetryConfig) + o.Expect(err).To(o.BeNil()) + + var telemetryMetricsCountQuery string + for _, match := range telemetryConfig.Matches { + telemetryMetricsCountQuery += fmt.Sprintf("%s or ", match) } - o.Expect(len(kubeStateMetricsMainMetrics)).To(o.BeNumerically(">", 0)) + telemetryMetricsCountQuery = fmt.Sprintf("count(%s)", telemetryMetricsCountQuery[:len(telemetryMetricsCountQuery)-4]) o.Eventually(func() error { - postRelabelingMetric := "scrape_samples_post_metric_relabeling" - relabelingMetricQuery := fmt.Sprintf("sum(%s{job=\"%s\",endpoint=\"https-main\",namespace=\"%s\"})", postRelabelingMetric, appName, operatorNamespaceName) - queryResponse, err := helper.RunQuery(tctx, r.pclient, relabelingMetricQuery) + telemetryMetricsCountQueryResponse, err := helper.RunQuery(tctx, r.pclient, telemetryMetricsCountQuery) if err != nil { - return err + return fmt.Errorf("failed to run constructed telemetry metrics query: %v", err) } - if len(queryResponse.Data.Result) == 0 { - return fmt.Errorf("no result found for metric %q", postRelabelingMetric) + if len(telemetryMetricsCountQueryResponse.Data.Result) == 0 { + return fmt.Errorf("no result found for constructed telemetry metrics query") } - wantCount := int(queryResponse.Data.Result[0].Value) + wantCount := int(telemetryMetricsCountQueryResponse.Data.Result[0].Value) - kubeStateMetricsMainMetricsString := strings.Join(kubeStateMetricsMainMetrics, "") - kubeStateMetricsMainMetricsCountQuery := fmt.Sprintf("count({__name__=~\"%s\"})", kubeStateMetricsMainMetricsString[:len(kubeStateMetricsMainMetricsString)-1 /* drop the last "|" or ")" */]) - queryResponse, err = helper.RunQuery(tctx, r.pclient, kubeStateMetricsMainMetricsCountQuery) + telemetrySelectedSeriesCountQuery := "cluster:telemetry_selected_series:count" + telemetrySelectedSeriesCountQueryResponse, err := helper.RunQuery(tctx, r.pclient, telemetrySelectedSeriesCountQuery) if err != nil { - return err + return fmt.Errorf("failed to run metric %q: %v", telemetrySelectedSeriesCountQuery, err) } - if len(queryResponse.Data.Result) == 0 { - return fmt.Errorf("no result found for metric %q", kubeStateMetricsMainMetricsCountQuery) + if len(telemetrySelectedSeriesCountQueryResponse.Data.Result) == 0 { + return fmt.Errorf("no result found for metric %q", telemetrySelectedSeriesCountQuery) } - gotCount := int(queryResponse.Data.Result[0].Value) - + gotCount := int(telemetrySelectedSeriesCountQueryResponse.Data.Result[0].Value) if gotCount != wantCount { return fmt.Errorf("got %v, want %v", gotCount, wantCount) } @@ -315,6 +316,80 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil }) }) +func (r runner) compareComponentRelabellingsToCountForProfile(tctx context.Context, appNameSelector, appName, metricSubsystem, profile string) { + var serviceMonitor *prometheusoperatorv1.ServiceMonitor + o.Eventually(func() error { + monitors, err := r.fetchMonitorsFor(tctx, [2]string{collectionProfileFeatureLabel, profile}, [2]string{appNameSelector, appName}) + if err != nil { + return err + } + if len(monitors.Items) == 0 { + return fmt.Errorf("no monitors found with collection profile: %q and %#v=%q", profile, appNameSelector, appName) + } + if len(monitors.Items) > 1 { + return fmt.Errorf("more than one monitor found with collection profile: %q and %#v=%q", profile, appNameSelector, appName) + } + serviceMonitor = monitors.Items[0] + + return nil + }, pollTimeout, pollInterval).Should(o.BeNil()) + + var metrics []string + spec := serviceMonitor.Spec + specEndpoints := spec.Endpoints + if len(specEndpoints) != 0 { + relabelings := specEndpoints[0].MetricRelabelConfigs + if len(relabelings) != 0 { + for _, relabeling := range relabelings { + if relabeling.Action == "keep" && + len(relabeling.SourceLabels) == 1 && + relabeling.SourceLabels[0] == "__name__" { + regexpString := relabeling.Regex + subsystemRegex := regexp.MustCompile("(?U)(" + metricSubsystem + ".*)[|,)]") + subsystemMetrics := subsystemRegex.FindAllString(regexpString, -1) + for _, metric := range subsystemMetrics { + if strings.HasPrefix(metric, strings.ReplaceAll(appName, "-", "_")) { + continue + } + metrics = append(metrics, metric) + } + } + } + } + } + o.Expect(len(metrics)).To(o.BeNumerically(">", 0)) + + o.Eventually(func() error { + postRelabelingMetric := "scrape_samples_post_metric_relabeling" + relabelingMetricQuery := fmt.Sprintf("sum(%s{job=\"%s\",endpoint=\"https-main\",namespace=\"%s\"})", postRelabelingMetric, appName, operatorNamespaceName) + queryResponse, err := helper.RunQuery(tctx, r.pclient, relabelingMetricQuery) + if err != nil { + return err + } + if len(queryResponse.Data.Result) == 0 { + return fmt.Errorf("no result found for metric %q", postRelabelingMetric) + } + wantCount := int(queryResponse.Data.Result[0].Value) + + metricsString := strings.Join(metrics, "") + metricsCountQuery := fmt.Sprintf("count({__name__=~\"%s\"})", metricsString[:len(metricsString)-1 /* drop the last "|" or ")" */]) + queryResponse, err = helper.RunQuery(tctx, r.pclient, metricsCountQuery) + if err != nil { + return err + } + if len(queryResponse.Data.Result) == 0 { + return fmt.Errorf("no result found for metric %q", metricsCountQuery) + } + gotCount := int(queryResponse.Data.Result[0].Value) + + if gotCount != wantCount { + return fmt.Errorf("got %v, want %v", gotCount, wantCount) + } + + return nil + }, pollTimeout, pollInterval).Should(o.BeNil()) +} + func (r runner) isProfileEnabled(ctx context.Context, profile string) (bool, error) { vectorExpression := "max(profile:cluster_monitoring_operator_collection_profile:max{profile=\"%s\"}) == 1" queryResponse, err := helper.RunQuery(ctx, r.pclient, fmt.Sprintf(vectorExpression, profile)) From c44d2e2aa16ad54651217189413cbaa7a374afb1 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Mon, 29 Dec 2025 03:08:29 +0530 Subject: [PATCH 2/3] fixup! MON-4389: Add tests for `telemetry` collection profile --- .../prometheus/collection_profiles.go | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/test/extended/prometheus/collection_profiles.go b/test/extended/prometheus/collection_profiles.go index 8839947405c2..d9dfee7e7173 100644 --- a/test/extended/prometheus/collection_profiles.go +++ b/test/extended/prometheus/collection_profiles.go @@ -83,7 +83,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil if errors.IsNotFound(err) { g.By("initially, creating a configuration for the operator as it did not exist") operatorConfiguration = nil - return r.makeCollectionProfileConfigurationFor(tctx, collectionProfileDefault) + return r.makeCollectionProfileConfigurationFor(tctx, collectionProfileDefault, false) } return err @@ -126,7 +126,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil profile := collectionProfileDefault g.BeforeAll(func() { - err := r.makeCollectionProfileConfigurationFor(tctx, profile) + err := r.makeCollectionProfileConfigurationFor(tctx, profile, false) o.Expect(err).To(o.BeNil()) o.Eventually(func() error { enabled, err := r.isProfileEnabled(tctx, profile) @@ -161,7 +161,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil g.Context("in a heterogeneous environment,", func() { g.It("should expose information about the applied collection profile using meta-metrics", func() { for _, profile := range collectionProfilesSupportedList { - err := r.makeCollectionProfileConfigurationFor(tctx, profile) + err := r.makeCollectionProfileConfigurationFor(tctx, profile, false) o.Expect(err).To(o.BeNil()) o.Eventually(func() error { @@ -180,7 +180,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil }) g.It("should have at least one implementation for each collection profile", func() { for _, profile := range collectionProfilesSupportedList { - err := r.makeCollectionProfileConfigurationFor(tctx, profile) + err := r.makeCollectionProfileConfigurationFor(tctx, profile, false) o.Expect(err).To(o.BeNil()) o.Eventually(func() error { @@ -197,7 +197,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil } }) g.It("should revert to default collection profile when an empty collection profile value is specified", func() { - err := r.makeCollectionProfileConfigurationFor(tctx, collectionProfileNone) + err := r.makeCollectionProfileConfigurationFor(tctx, collectionProfileNone, false) o.Expect(err).To(o.BeNil()) o.Eventually(func() error { @@ -218,7 +218,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil profile := collectionProfileMinimal g.BeforeAll(func() { - err := r.makeCollectionProfileConfigurationFor(tctx, profile) + err := r.makeCollectionProfileConfigurationFor(tctx, profile, false) o.Expect(err).To(o.BeNil()) o.Eventually(func() error { enabled, err := r.isProfileEnabled(tctx, profile) @@ -242,22 +242,21 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil profile := collectionProfileTelemetry g.BeforeAll(func() { - if enabledErr, err := telemetryIsEnabled(tctx, r.kclient); err != nil { - g.Fail(fmt.Sprintf("failed to determine if telemetry is enabled: %v", err)) - } else if enabledErr != nil { - g.Skip("telemetry is not enabled on this cluster, skipping telemetry collection profile tests") - } - - err := r.makeCollectionProfileConfigurationFor(tctx, profile) + err := r.makeCollectionProfileConfigurationFor(tctx, profile, true) o.Expect(err).To(o.BeNil()) o.Eventually(func() error { enabled, err := r.isProfileEnabled(tctx, profile) if err != nil { - return err + return fmt.Errorf("encountered error while checking if profile %q is enabled: %v", profile, err) } if !enabled { return fmt.Errorf("collection profile %q is not enabled", profile) } + if enabledErr, err := telemetryIsEnabled(tctx, r.kclient); err != nil { + return fmt.Errorf("failed to determine if telemetry is enabled: %v", err) + } else if enabledErr != nil { + return fmt.Errorf("telemetry is not enabled") + } return nil }, pollTimeout, pollInterval).Should(o.BeNil()) @@ -307,7 +306,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil } gotCount := int(telemetrySelectedSeriesCountQueryResponse.Data.Result[0].Value) if gotCount != wantCount { - return fmt.Errorf("got %v, want %v", gotCount, wantCount) + return fmt.Errorf("compared %s against %s: got %v, want %v", telemetrySelectedSeriesCountQuery, telemetryMetricsCountQuery, gotCount, wantCount) } return nil @@ -415,7 +414,7 @@ func (r runner) fetchMonitorsFor(ctx context.Context, selectors ...[2]string) (* }) } -func (r runner) makeCollectionProfileConfigurationFor(ctx context.Context, collectionProfile string) error { +func (r runner) makeCollectionProfileConfigurationFor(ctx context.Context, collectionProfile string, enableTelemetry bool) error { dataConfigYAMLPrometheusK8s := fmt.Sprintf("collectionProfile: %s", collectionProfile) dataConfigYAMLPrometheusK8sStructured := map[string]interface{}{ "collectionProfile": collectionProfile, @@ -453,6 +452,11 @@ func (r runner) makeCollectionProfileConfigurationFor(ctx context.Context, colle } else { gotDataConfigYAMLMap["prometheusK8s"].(map[string]interface{})["collectionProfile"] = collectionProfile } + if enableTelemetry { + if _, ok := gotDataConfigYAMLMap["telemeterClient"]; ok { + gotDataConfigYAMLMap["telemeterClient"].(map[string]interface{})["enabled"] = true + } + } gotDataConfigYAMLRaw, err := yaml.Marshal(gotDataConfigYAMLMap) if err != nil { return err From 6ddd5e49df6da1d47d57c3a55d54d87d6e01094b Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Mon, 29 Dec 2025 17:58:19 +0530 Subject: [PATCH 3/3] fixup! fixup! MON-4389: Add tests for `telemetry` collection profile --- test/extended/prometheus/collection_profiles.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/extended/prometheus/collection_profiles.go b/test/extended/prometheus/collection_profiles.go index d9dfee7e7173..dd0b8f47f8db 100644 --- a/test/extended/prometheus/collection_profiles.go +++ b/test/extended/prometheus/collection_profiles.go @@ -294,7 +294,7 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil if len(telemetryMetricsCountQueryResponse.Data.Result) == 0 { return fmt.Errorf("no result found for constructed telemetry metrics query") } - wantCount := int(telemetryMetricsCountQueryResponse.Data.Result[0].Value) + gotCount := int(telemetryMetricsCountQueryResponse.Data.Result[0].Value) telemetrySelectedSeriesCountQuery := "cluster:telemetry_selected_series:count" telemetrySelectedSeriesCountQueryResponse, err := helper.RunQuery(tctx, r.pclient, telemetrySelectedSeriesCountQuery) @@ -304,9 +304,18 @@ var _ = g.Describe("[sig-instrumentation][OCPFeatureGate:MetricsCollectionProfil if len(telemetrySelectedSeriesCountQueryResponse.Data.Result) == 0 { return fmt.Errorf("no result found for metric %q", telemetrySelectedSeriesCountQuery) } - gotCount := int(telemetrySelectedSeriesCountQueryResponse.Data.Result[0].Value) - if gotCount != wantCount { - return fmt.Errorf("compared %s against %s: got %v, want %v", telemetrySelectedSeriesCountQuery, telemetryMetricsCountQuery, gotCount, wantCount) + wantCount := int(telemetrySelectedSeriesCountQueryResponse.Data.Result[0].Value) + largerCount, smallerCount := gotCount, wantCount + if wantCount > gotCount { + largerCount, smallerCount = smallerCount, largerCount + } + seriesDifference := float64(largerCount - smallerCount) + permittedVariance := 0.05 + if largerCount == 0 { + return fmt.Errorf("both telemetry metric count and telemetry selected series count are zero") + } + if seriesDifference/float64(largerCount) > permittedVariance { + return fmt.Errorf("compared %q against %q: want %v, got %v", telemetrySelectedSeriesCountQuery, telemetryMetricsCountQuery, wantCount, gotCount) } return nil