2 changes: 1 addition & 1 deletion .github/workflows/pre-release-workflow.yml
Collaborator
Code has 47% test coverage; let's try to move to 90%.

Author
I have added more tests.
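
For reference, a minimal sketch of the kind of unit test that lifts coverage for the new TelemetryReconciler, using controller-runtime's fake client. This is not the PR's actual test code: the ConfigMap name/namespace are illustrative, the stub assumes applyTelemetryFn has the signature its call site in Reconcile suggests (ctx, client, configmap) returning (ctrl.Result, error), and it assumes the metrics/instrumentation helpers called by Reconcile tolerate running outside a manager.

package controller

import (
	"context"
	"testing"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes/scheme"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// Exercises both Reconcile paths: an existing telemetry ConfigMap should reach the
// (stubbed) sender, and a missing one should requeue without returning an error.
func TestTelemetryReconcile(t *testing.T) {
	cm := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{Name: "splunk-operator-telemetry", Namespace: "splunk-operator"}, // illustrative name
		Data:       map[string]string{"status": `{"lastTransmission": ""}`},
	}
	c := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(cm).Build()

	// Stub out the real sender; applyTelemetryFn is a package-level var, which makes this possible.
	sent := false
	applyTelemetryFn = func(ctx context.Context, cl client.Client, m *corev1.ConfigMap) (ctrl.Result, error) {
		sent = true
		return ctrl.Result{}, nil
	}

	r := &TelemetryReconciler{Client: c, Scheme: scheme.Scheme}
	req := ctrl.Request{NamespacedName: types.NamespacedName{Name: cm.Name, Namespace: cm.Namespace}}

	if _, err := r.Reconcile(context.TODO(), req); err != nil {
		t.Fatalf("reconcile returned error: %v", err)
	}
	if !sent {
		t.Fatal("expected telemetry to be sent for an existing configmap")
	}

	// Missing ConfigMap: expect a requeue, not an error.
	req.Name = "does-not-exist"
	res, err := r.Reconcile(context.TODO(), req)
	if err != nil || !res.Requeue {
		t.Fatalf("expected requeue without error, got result=%v err=%v", res, err)
	}
}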

@@ -249,4 +249,4 @@ jobs:
body: |
### Automated Pull Request for Splunk Operator Release ${{ github.event.inputs.release_version }}
* Changes added to docs/ChangeLog-NEW.md. Please filter and update ChangeLog.md
* Delete ChangeLog-New.md
* Delete ChangeLog-New.md
6 changes: 5 additions & 1 deletion Makefile
@@ -208,6 +208,7 @@ deploy: manifests kustomize uninstall ## Deploy controller to the K8s cluster sp
$(SED) "s/value: WATCH_NAMESPACE_VALUE/value: \"${WATCH_NAMESPACE}\"/g" config/${ENVIRONMENT}/kustomization.yaml
$(SED) "s|SPLUNK_ENTERPRISE_IMAGE|${SPLUNK_ENTERPRISE_IMAGE}|g" config/${ENVIRONMENT}/kustomization.yaml
$(SED) "s/value: SPLUNK_GENERAL_TERMS_VALUE/value: \"${SPLUNK_GENERAL_TERMS}\"/g" config/${ENVIRONMENT}/kustomization.yaml
$(SED) 's/\("sokVersion": \)"[^"]*"/\1"$(VERSION)"/' config/manager/controller_manager_telemetry.yaml
cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
RELATED_IMAGE_SPLUNK_ENTERPRISE=${SPLUNK_ENTERPRISE_IMAGE} WATCH_NAMESPACE=${WATCH_NAMESPACE} SPLUNK_GENERAL_TERMS=${SPLUNK_GENERAL_TERMS} $(KUSTOMIZE) build config/${ENVIRONMENT} | kubectl apply --server-side --force-conflicts -f -
$(SED) "s/namespace: ${NAMESPACE}/namespace: splunk-operator/g" config/${ENVIRONMENT}/kustomization.yaml
@@ -395,6 +396,7 @@ run_clair_scan:

# generate artifacts needed to deploy operator, this is current way of doing it, need to fix this
generate-artifacts-namespace: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
$(SED) 's/\("sokVersion": \)"[^"]*"/\1"$(VERSION)"/' config/manager/controller_manager_telemetry.yaml
mkdir -p release-${VERSION}
cp config/default/kustomization-namespace.yaml config/default/kustomization.yaml
cp config/rbac/kustomization-namespace.yaml config/rbac/kustomization.yaml
@@ -410,6 +412,7 @@ generate-artifacts-namespace: manifests kustomize ## Deploy controller to the K8

# generate artifacts needed to deploy operator, this is current way of doing it, need to fix this
generate-artifacts-cluster: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
$(SED) 's/\("sokVersion": \)"[^"]*"/\1"$(VERSION)"/' config/manager/controller_manager_telemetry.yaml
mkdir -p release-${VERSION}
cp config/default/kustomization-cluster.yaml config/default/kustomization.yaml
cp config/rbac/kustomization-cluster.yaml config/rbac/kustomization.yaml
@@ -469,4 +472,5 @@ setup/ginkgo:
build-installer: manifests generate kustomize
	mkdir -p dist
	cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
	$(KUSTOMIZE) build config/default > dist/install.yaml
	$(KUSTOMIZE) build config/default > dist/install.yaml

7 changes: 7 additions & 0 deletions cmd/main.go
@@ -262,6 +262,13 @@ func main() {
		setupLog.Error(err, "unable to create controller", "controller", "Standalone")
		os.Exit(1)
	}
	if err = (&intController.TelemetryReconciler{
		Client: mgr.GetClient(),
		Scheme: mgr.GetScheme(),
	}).SetupWithManager(mgr); err != nil {
		setupLog.Error(err, "unable to create controller", "controller", "Telemetry")
		os.Exit(1)
	}

	// Setup centralized validation webhook server (opt-in via ENABLE_VALIDATION_WEBHOOK env var, defaults to false)
	enableWebhooks := os.Getenv("ENABLE_VALIDATION_WEBHOOK")
11 changes: 11 additions & 0 deletions config/manager/controller_manager_telemetry.yaml
@@ -0,0 +1,11 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: manager-telemetry
data:
  status: |
    {
      "lastTransmission": "",
      "test": "false",
      "sokVersion": "3.1.0"
    }
1 change: 1 addition & 0 deletions config/manager/kustomization.yaml
@@ -1,5 +1,6 @@
resources:
- manager.yaml
- controller_manager_telemetry.yaml

generatorOptions:
  disableNameSuffixHash: true
115 changes: 115 additions & 0 deletions internal/controller/telemetry_controller.go
@@ -0,0 +1,115 @@
/*
Copyright (c) 2026 Splunk Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller

import (
	"context"
	"fmt"
	enterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise"
	ctrl "sigs.k8s.io/controller-runtime"
	"time"

	metrics "github.com/splunk/splunk-operator/pkg/splunk/client/metrics"

	corev1 "k8s.io/api/core/v1"
	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"

	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
)

const (
	// The two constants below are defined in kustomization*.yaml
	ConfigMapNamePrefix = "splunk-operator-"
	ConfigMapLabelName  = "splunk-operator"

	telemetryRetryDelay = time.Second * 600
)

var applyTelemetryFn = enterprise.ApplyTelemetry

type TelemetryReconciler struct {
	client.Client
	Scheme *runtime.Scheme
}

//+kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch

func (r *TelemetryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	metrics.ReconcileCounters.With(metrics.GetPrometheusLabels(req, "Telemetry")).Inc()
	defer recordInstrumentionData(time.Now(), req, "controller", "Telemetry")

	reqLogger := log.FromContext(ctx)
	reqLogger = reqLogger.WithValues("telemetry", req.NamespacedName)

	reqLogger.Info("Reconciling telemetry")

	defer func() {
		if rec := recover(); rec != nil {
			reqLogger.Error(fmt.Errorf("panic: %v", rec), "Recovered from panic in TelemetryReconciler.Reconcile")
		}
	}()

	// Fetch the ConfigMap
	cm := &corev1.ConfigMap{}
	err := r.Get(ctx, req.NamespacedName, cm)
	if err != nil {
		if k8serrors.IsNotFound(err) {
			reqLogger.Info("telemetry configmap not found; requeueing", "period(seconds)", int(telemetryRetryDelay/time.Second))
			return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil
		}
		reqLogger.Error(err, "could not load telemetry configmap; requeueing", "period(seconds)", int(telemetryRetryDelay/time.Second))
		return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil
	}

	if len(cm.Data) == 0 {
		reqLogger.Info("telemetry configmap has no data keys")
		return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil
	}

	reqLogger.Info("start", "Telemetry configmap version", cm.GetResourceVersion())

	result, err := applyTelemetryFn(ctx, r.Client, cm)
	if err != nil {
		reqLogger.Error(err, "Failed to send telemetry")
		return ctrl.Result{Requeue: true, RequeueAfter: telemetryRetryDelay}, nil
	}
	if result.Requeue && result.RequeueAfter != 0 {
		reqLogger.Info("Requeued", "period(seconds)", int(result.RequeueAfter/time.Second))
	}

	return result, err
}

// SetupWithManager sets up the controller with the Manager.
func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
Collaborator
Should you be watching for CR resource creation and processing CRs only when a new CR is created?

Collaborator
Can you implement an event-driven approach where the telemetry controller watches the actual Splunk custom resources and only triggers reconciliation when:

  1. A new CR is created (Standalone, ClusterMaster, IndexerCluster, SearchHeadCluster, etc.)
  2. An existing CR is modified (configuration changes, scaling events)
  3. A CR is deleted (to track removal events)

Benefits of This Approach

1. Reduced Resource Consumption

  • No periodic reconciliation when nothing has changed
  • CPU and memory usage only when actual events occur
  • More efficient for clusters with stable configurations

2. Immediate Response

  • Telemetry collected immediately when CRs are created/modified
  • No waiting for the next 10-minute requeue cycle
  • More accurate timestamps for resource creation events

3. Better Alignment with Kubernetes Best Practices

  • Controllers should react to resource changes, not poll
  • Leverages Kubernetes watch mechanism efficiently
  • Reduces unnecessary API server load

4. Clearer Intent

  • The controller's purpose becomes explicit: "Send telemetry when Splunk resources change"
  • Easier to understand and maintain
  • Better for debugging (logs show which CR triggered telemetry)

Proposed Implementation Changes

Current Setup (from SetupWithManager):

func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
    return ctrl.NewControllerManagedBy(mgr).
        For(&corev1.ConfigMap{}).  // Watching ConfigMaps
        WithEventFilter(predicate.Funcs{
            CreateFunc: func(e event.CreateEvent) bool {
                return r.isTelemetryConfigMap(e.Object)
            },
            // ... more predicates
        }).
        WithOptions(controller.Options{
            MaxConcurrentReconciles: 1,
        }).
        Complete(r)
}

Suggested Alternative:

func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
    return ctrl.NewControllerManagedBy(mgr).
        // Watch Splunk CRs directly
        For(&enterprisev4.Standalone{}).
        Owns(&enterprisev4.ClusterMaster{}).
        Owns(&enterprisev4.IndexerCluster{}).
        Owns(&enterprisev4.SearchHeadCluster{}).
        // ... other Splunk CRs
        WithEventFilter(predicate.Funcs{
            CreateFunc: func(e event.CreateEvent) bool {
                // Trigger on CR creation
                return true
            },
            UpdateFunc: func(e event.UpdateEvent) bool {
                // Optionally trigger on significant updates
                return shouldCollectTelemetry(e.ObjectOld, e.ObjectNew)
            },
            DeleteFunc: func(e event.DeleteEvent) bool {
                // Optionally track deletions
                return false
            },
        }).
        WithOptions(controller.Options{
            MaxConcurrentReconciles: 1,
        }).
        Complete(r)
}

Modified Reconcile Method:

func (r *TelemetryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    log := r.Log.WithValues("telemetry", req.NamespacedName)

    // Fetch the actual Splunk CR that triggered this reconciliation
    // Determine CR type and get relevant telemetry data

    // Collect telemetry for THIS specific resource
    telemetryData := r.collectResourceTelemetry(ctx, req)

    // Send telemetry immediately (no requeue needed!)
    if err := r.applyTelemetryFn(ctx, telemetryData); err != nil {
        log.Error(err, "Failed to send telemetry")
        // Only requeue on actual errors, not as a periodic timer
        return ctrl.Result{Requeue: true}, err
    }

    // Done! No automatic requeue
    return ctrl.Result{}, nil
}

Additional Considerations

1. Rate Limiting

If watching CRs directly, consider the following (a minimal debounce sketch follows this list):

  • Implementing rate limiting to avoid telemetry spam
  • Batching multiple CR events within a time window
  • Using a "debounce" mechanism for rapid successive changes
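
One possible shape for the debounce idea, sketched on the assumption that the controller keeps using controller-runtime predicates; debouncePredicate and the window value are illustrative, not part of this PR.

package controller

import (
	"sync"
	"time"

	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
)

// debouncePredicate drops events for an object that was already seen within the
// given window, so a burst of rapid CR updates produces a single reconcile.
func debouncePredicate(window time.Duration) predicate.Predicate {
	var mu sync.Mutex
	lastSeen := map[string]time.Time{}
	return predicate.NewPredicateFuncs(func(obj client.Object) bool {
		mu.Lock()
		defer mu.Unlock()
		key := obj.GetNamespace() + "/" + obj.GetName()
		if t, ok := lastSeen[key]; ok && time.Since(t) < window {
			return false // still inside the debounce window; skip this event
		}
		lastSeen[key] = time.Now()
		return true
	})
}

It would chain onto the existing builder alongside the name/label filter, e.g. .WithEventFilter(debouncePredicate(10 * time.Minute)).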

2. Daily Telemetry Requirement

The PR mentions "collecting and sending telemetry data once per day". If this is the actual requirement:

Option A: Use a CronJob instead of a controller

apiVersion: batch/v1
kind: CronJob
metadata:
  name: splunk-operator-telemetry
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: telemetry-collector
            # Collect and send telemetry

Option B: If controller is needed, add timestamp-based logic:

// Check last telemetry send time
lastSent := getLastTelemetrySendTime()
if time.Since(lastSent) < 24*time.Hour {
    // Skip telemetry, already sent today
    return ctrl.Result{}, nil
}
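
Option B could key off the lastTransmission field that this PR's ConfigMap already carries. A sketch under two assumptions: the status key keeps the JSON shape shown in config/manager/controller_manager_telemetry.yaml, and timestamps are written in RFC 3339; dueForTelemetry and telemetryStatus are illustrative names.

package controller

import (
	"encoding/json"
	"time"

	corev1 "k8s.io/api/core/v1"
)

// telemetryStatus mirrors the JSON blob stored under the ConfigMap's "status" key.
type telemetryStatus struct {
	LastTransmission string `json:"lastTransmission"`
	SokVersion       string `json:"sokVersion"`
}

// dueForTelemetry reports whether at least interval has elapsed since the recorded
// lastTransmission; an empty or unparsable value counts as due, so the first run sends.
func dueForTelemetry(cm *corev1.ConfigMap, interval time.Duration, now time.Time) bool {
	var st telemetryStatus
	if err := json.Unmarshal([]byte(cm.Data["status"]), &st); err != nil || st.LastTransmission == "" {
		return true
	}
	last, err := time.Parse(time.RFC3339, st.LastTransmission)
	if err != nil {
		return true
	}
	return now.Sub(last) >= interval
}

Reconcile could then return early with a RequeueAfter sized to the remaining wait whenever dueForTelemetry returns false.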

		For(&corev1.ConfigMap{}).
		WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool {
			labels := obj.GetLabels()
			if labels == nil {
				return false
			}
			return obj.GetName() == enterprise.GetTelemetryConfigMapName(ConfigMapNamePrefix) && labels["name"] == ConfigMapLabelName
		})).
		WithOptions(controller.Options{
			MaxConcurrentReconciles: 1,
		}).
		Complete(r)
}