-
Notifications
You must be signed in to change notification settings - Fork 128
Telemetry enhancement #1677
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Telemetry enhancement #1677
Changes from all commits
edf2618
656737b
930784e
694e766
a01170d
7092e25
f7c5c88
bcf5434
ddd3186
d0e0f5e
2dc1851
82480ef
dc08f44
381df2c
0009f50
9c8dcbd
76a0110
bb988ff
23ac8e8
9e6903b
06ca952
77df860
c977bdc
5306a09
d7d9a71
02eb7aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: manager-telemetry | ||
| data: | ||
| status: | | ||
| { | ||
| "lastTransmission": "", | ||
| "test": "false", | ||
| "sokVersion": "3.1.0" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| resources: | ||
| - manager.yaml | ||
| - controller_manager_telemetry.yaml | ||
|
|
||
| generatorOptions: | ||
| disableNameSuffixHash: true | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| /* | ||
| Copyright (c) 2026 Splunk Inc. All rights reserved. | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| */ | ||
| package controller | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" | ||
| ctrl "sigs.k8s.io/controller-runtime" | ||
| "time" | ||
|
|
||
| metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics" | ||
|
|
||
| corev1 "k8s.io/api/core/v1" | ||
| k8serrors "k8s.io/apimachinery/pkg/api/errors" | ||
| "k8s.io/apimachinery/pkg/runtime" | ||
|
|
||
| "sigs.k8s.io/controller-runtime/pkg/client" | ||
| "sigs.k8s.io/controller-runtime/pkg/controller" | ||
| "sigs.k8s.io/controller-runtime/pkg/log" | ||
| "sigs.k8s.io/controller-runtime/pkg/predicate" | ||
| ) | ||
|
|
||
| const ( | ||
| // Below two constants are defined at kustomization*.yaml | ||
| ConfigMapNamePrefix = "splunk-operator-" | ||
| ConfigMapLabelName = "splunk-operator" | ||
|
|
||
| // telemetryRetryDelay is the requeue period (10 minutes) used whenever a telemetry attempt fails or must be deferred. | ||
| telemetryRetryDelay = time.Second * 600 | ||
| ) | ||
|
|
||
| // applyTelemetryFn is a package-level indirection over enterprise.ApplyTelemetry — presumably so tests can substitute a stub; TODO confirm against the test suite. | ||
| var applyTelemetryFn = enterprise.ApplyTelemetry | ||
|
|
||
| // TelemetryReconciler reconciles the operator telemetry ConfigMap and forwards its contents to the enterprise telemetry pipeline. | ||
| type TelemetryReconciler struct { | ||
| client.Client | ||
| Scheme *runtime.Scheme | ||
| } | ||
|
|
||
| //+kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch | ||
|
|
||
| func (r *TelemetryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { | ||
| metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "Telemetry")).Inc() | ||
| defer recordInstrumentionData(time.Now(), req, "controller", "Telemetry") | ||
|
|
||
| reqLogger := log.FromContext(ctx) | ||
| reqLogger = reqLogger.WithValues("telemetry", req.NamespacedName) | ||
|
|
||
| reqLogger.Info("Reconciling telemetry") | ||
|
|
||
| defer func() { | ||
| if rec := recover(); rec != nil { | ||
| reqLogger.Error(fmt.Errorf("panic: %v", rec), "Recovered from panic in TelemetryReconciler.Reconcile") | ||
| } | ||
| }() | ||
|
|
||
| // Fetch the ConfigMap | ||
| cm := &corev1.ConfigMap{} | ||
| err := r.Get(ctx, req.NamespacedName, cm) | ||
| if err != nil { | ||
| if k8serrors.IsNotFound(err) { | ||
| reqLogger.Info("telemetry configmap not found; requeueing", "period(seconds)", int(telemetryRetryDelay/time.Second)) | ||
| return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil | ||
| } | ||
| reqLogger.Error(err, "could not load telemetry configmap; requeueing", "period(seconds)", int(telemetryRetryDelay/time.Second)) | ||
| return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil | ||
| } | ||
|
|
||
| if len(cm.Data) == 0 { | ||
| reqLogger.Info("telemetry configmap has no data keys") | ||
| return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil | ||
| } | ||
|
|
||
| reqLogger.Info("start", "Telemetry configmap version", cm.GetResourceVersion()) | ||
|
|
||
| result, err := applyTelemetryFn(ctx, r.Client, cm) | ||
| if err != nil { | ||
| reqLogger.Error(err, "Failed to send telemetry") | ||
| return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil | ||
| } | ||
| if result.Requeue && result.RequeueAfter != 0 { | ||
| reqLogger.Info("Requeued", "period(seconds)", int(result.RequeueAfter/time.Second)) | ||
| } | ||
|
|
||
| return result, err | ||
| } | ||
|
|
||
| // SetupWithManager sets up the controller with the Manager. | ||
| func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error { | ||
| return ctrl.NewControllerManagedBy(mgr). | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should you be watching for CR resource creation and processing resources only when a new CR is created?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you implement an event-driven approach where the telemetry controller watches the actual Splunk custom resources and only triggers reconciliation when:
Benefits of This Approach

1. Reduced Resource Consumption
2. Immediate Response
3. Better Alignment with Kubernetes Best Practices
4. Clearer Intent
Proposed Implementation Changes

Current Setup (from SetupWithManager):

func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.ConfigMap{}). // Watching ConfigMaps
WithEventFilter(predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
return r.isTelemetryConfigMap(e.Object)
},
// ... more predicates
}).
WithOptions(controller.Options{
MaxConcurrentReconciles: 1,
}).
Complete(r)
}Suggested Alternative:func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
// Watch Splunk CRs directly
For(&enterprisev4.Standalone{}).
Owns(&enterprisev4.ClusterMaster{}).
Owns(&enterprisev4.IndexerCluster{}).
Owns(&enterprisev4.SearchHeadCluster{}).
// ... other Splunk CRs
WithEventFilter(predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
// Trigger on CR creation
return true
},
UpdateFunc: func(e event.UpdateEvent) bool {
// Optionally trigger on significant updates
return shouldCollectTelemetry(e.ObjectOld, e.ObjectNew)
},
DeleteFunc: func(e event.DeleteEvent) bool {
// Optionally track deletions
return false
},
}).
WithOptions(controller.Options{
MaxConcurrentReconciles: 1,
}).
Complete(r)
}

Modified Reconcile Method:

func (r *TelemetryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("telemetry", req.NamespacedName)
// Fetch the actual Splunk CR that triggered this reconciliation
// Determine CR type and get relevant telemetry data
// Collect telemetry for THIS specific resource
telemetryData := r.collectResourceTelemetry(ctx, req)
// Send telemetry immediately (no requeue needed!)
if err := r.applyTelemetryFn(ctx, telemetryData); err != nil {
log.Error(err, "Failed to send telemetry")
// Only requeue on actual errors, not as a periodic timer
return ctrl.Result{Requeue: true}, err
}
// Done! No automatic requeue
return ctrl.Result{}, nil
Additional Considerations

1. Rate Limiting

If watching CRs directly, consider:
2. Daily Telemetry Requirement

The PR mentions "collecting and sending telemetry data once per day". If this is the actual requirement:

Option A: Use a CronJob instead of a controller

apiVersion: batch/v1
kind: CronJob
metadata:
name: splunk-operator-telemetry
spec:
schedule: "0 2 * * *" # Daily at 2 AM
jobTemplate:
spec:
template:
spec:
containers:
- name: telemetry-collector
# Collect and send telemetryOption B: If controller is needed, add timestamp-based logic: // Check last telemetry send time
lastSent := getLastTelemetrySendTime()
if time.Since(lastSent) < 24*time.Hour {
// Skip telemetry, already sent today
return ctrl.Result{}, nil
} |
||
| For(&corev1.ConfigMap{}). | ||
| WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool { | ||
| labels := obj.GetLabels() | ||
| if labels == nil { | ||
| return false | ||
| } | ||
| return obj.GetName() == enterprise.GetTelemetryConfigMapName(ConfigMapNamePrefix) && labels["name"] == ConfigMapLabelName | ||
| })). | ||
| WithOptions(controller.Options{ | ||
| MaxConcurrentReconciles: 1, | ||
| }). | ||
| Complete(r) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
code has 47% test coverage lets try to move to 90%
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added more tests.