From 586728d365c6688241e03bffde95b5fb77c347d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 4 Feb 2026 17:07:35 +0100 Subject: [PATCH 01/25] feat: observability with otel and default grafana dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 240 +++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100755 observability/install-observability.sh diff --git a/observability/install-observability.sh b/observability/install-observability.sh new file mode 100755 index 0000000000..2db117e0cd --- /dev/null +++ b/observability/install-observability.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Observability Stack${NC}" +echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" +echo -e "${GREEN}========================================${NC}" + +# Check if helm is installed +echo -e "\n${YELLOW}Checking helm installation...${NC}" +if ! 
command -v helm &> /dev/null; then + echo -e "${RED}Error: helm is not installed${NC}" + echo "Please install helm: https://helm.sh/docs/intro/install/" + exit 1 +fi +echo -e "${GREEN}✓ helm is installed${NC}" + +# Add Helm repositories +echo -e "\n${YELLOW}Adding Helm repositories...${NC}" +helm repo add jetstack https://charts.jetstack.io +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +echo -e "${GREEN}✓ Helm repositories added${NC}" + +# Install cert-manager (required for OpenTelemetry Operator) +echo -e "\n${YELLOW}Installing cert-manager...${NC}" +if kubectl get namespace cert-manager > /dev/null 2>&1; then + echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" +else + kubectl create namespace cert-manager + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true \ + --wait + echo -e "${GREEN}✓ cert-manager installed${NC}" +fi + +# Create observability namespace +echo -e "\n${YELLOW}Creating observability namespace...${NC}" +kubectl create namespace observability --dry-run=client -o yaml | kubectl apply -f - +echo -e "${GREEN}✓ observability namespace ready${NC}" + +# Install OpenTelemetry Operator +echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" +if helm list -n observability | grep -q opentelemetry-operator; then + echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" + helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +else + helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +fi +echo -e "${GREEN}✓ OpenTelemetry 
Operator installed${NC}" + +# Install kube-prometheus-stack (includes Prometheus + Grafana) +echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" +if helm list -n observability | grep -q kube-prometheus-stack; then + echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" + helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +else + helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +fi +echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" + +# Create OpenTelemetry Collector instance +echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" +cat < Date: Wed, 4 Feb 2026 17:28:34 +0100 Subject: [PATCH 02/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 2db117e0cd..e9c42f5968 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -50,6 +50,7 @@ echo -e "${GREEN}✓ observability namespace ready${NC}" # Install OpenTelemetry Operator echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" + if helm list -n observability | grep -q opentelemetry-operator; then echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" helm upgrade opentelemetry-operator 
open-telemetry/opentelemetry-operator \ From afab176eb9cc79bb6b84faa116316f2207782ef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 4 Feb 2026 17:54:09 +0100 Subject: [PATCH 03/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index e9c42f5968..314ee1e4aa 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -1,4 +1,19 @@ #!/bin/bash +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# set -e From b0af1e214c010956d9a185ce354cd8649986b5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 4 Feb 2026 18:05:06 +0100 Subject: [PATCH 04/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .github/workflows/pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7a5964ba35..34bc6c7d0f 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -11,6 +11,7 @@ on: paths-ignore: - 'docs/**' - 'adr/**' + - 'observability/**' workflow_dispatch: jobs: check_format_and_unit_tests: From edd24fe38f1fdc9bd5789530db4813c820e41863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Sun, 8 Feb 2026 16:17:46 +0100 Subject: [PATCH 05/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- grafana/README.md | 225 +++++++++++++++ grafana/install-observability.sh | 264 ++++++++++++++++++ sample-operators/webpage/pom.xml | 14 + .../operator/sample/WebPageOperator.java | 49 +++- .../src/main/resources/otlp-config.yaml | 6 + 5 files changed, 557 insertions(+), 1 deletion(-) create mode 100644 grafana/README.md create mode 100755 grafana/install-observability.sh create mode 100644 sample-operators/webpage/src/main/resources/otlp-config.yaml diff --git a/grafana/README.md b/grafana/README.md new file mode 100644 index 0000000000..35e1167190 --- /dev/null +++ b/grafana/README.md @@ -0,0 +1,225 @@ +# Observability Stack for Java Operator SDK + +This directory contains scripts and configuration for setting up a complete observability stack on minikube. 
+ +## Quick Start + +```bash +./install-observability.sh +``` + +This script installs: +- **OpenTelemetry Operator** - For collecting metrics and traces +- **Prometheus** - For metrics storage and querying +- **Grafana** - For visualization and dashboards +- **cert-manager** - Required for OpenTelemetry Operator webhooks + +## Prerequisites + +- kubectl configured +- Helm 3.x installed + +## Components Installed + +### OpenTelemetry Collector +- Receives metrics and traces via OTLP (gRPC and HTTP) +- Exports metrics to Prometheus format +- Configured with memory limiter and batch processing + +**Endpoints:** +- OTLP gRPC: `otel-collector-collector.observability.svc.cluster.local:4317` +- OTLP HTTP: `otel-collector-collector.observability.svc.cluster.local:4318` +- Prometheus metrics: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` + +### Prometheus +- Scrapes metrics from OpenTelemetry Collector +- Supports ServiceMonitor and PodMonitor CRDs +- Configured to discover all metrics automatically + +**Access:** +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Open http://localhost:9090 + +### Grafana +- Pre-configured with Prometheus as data source +- Includes Kubernetes monitoring dashboards + +**Access:** +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 +``` +Open http://localhost:3000 +- **Username:** admin +- **Password:** admin + +## Integrating with Your Operator + +### 1. Add OpenTelemetry Dependency + +Add to your `pom.xml`: + +```xml + + io.javaoperatorsdk + operator-framework-opentelemetry-support + ${josdk.version} + +``` + +### 2. 
Configure OpenTelemetry in Your Operator + +In your operator code: + +```java +import io.javaoperatorsdk.operator.monitoring.opentelemetry.OpenTelemetryMetrics; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; + +// Initialize OpenTelemetry +OpenTelemetry openTelemetry = AutoConfiguredOpenTelemetrySdk.initialize() + .getOpenTelemetrySdk(); + +// Create JOSDK metrics instance +Metrics metrics = OpenTelemetryMetrics.builder(openTelemetry) + .build(); + +// Configure operator with metrics +Operator operator = new Operator(client, o -> o.withMetrics(metrics)); +``` + +### 3. Set Environment Variables + +In your operator deployment YAML: + +```yaml +env: + - name: OTEL_SERVICE_NAME + value: "your-operator-name" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector-collector.observability.svc.cluster.local:4318" + - name: OTEL_METRICS_EXPORTER + value: "otlp" + - name: OTEL_TRACES_EXPORTER + value: "otlp" + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: "http/protobuf" +``` + +## Available JOSDK Metrics + +The following metrics are exported by JOSDK: + +| Metric | Type | Description | +|--------|------|-------------| +| `operator_sdk_reconciliations_started_total` | Counter | Total number of reconciliations started | +| `operator_sdk_reconciliations_success_total` | Counter | Total number of successful reconciliations | +| `operator_sdk_reconciliations_failed_total` | Counter | Total number of failed reconciliations | +| `operator_sdk_reconciliations_queue_size` | Gauge | Current reconciliation queue size | +| `operator_sdk_events_received_total` | Counter | Total number of Kubernetes events received | +| `operator_sdk_controllers_execution_reconcile_seconds` | Timer | Time taken for reconciliations | +| `operator_sdk_controllers_execution_cleanup_seconds` | Timer | Time taken for cleanup operations | + +## Creating Grafana Dashboards + +### Example PromQL Queries + +**Reconciliation 
Rate:** +```promql +sum(rate(operator_sdk_reconciliations_started_total[5m])) by (controller) +``` + +**Success Rate:** +```promql +sum(rate(operator_sdk_reconciliations_success_total[5m])) / +sum(rate(operator_sdk_reconciliations_started_total[5m])) +``` + +**Error Rate:** +```promql +sum(rate(operator_sdk_reconciliations_failed_total[5m])) by (controller, exception) +``` + +**Queue Size:** +```promql +operator_sdk_reconciliations_queue_size +``` + +**Average Reconciliation Duration:** +```promql +rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / +rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) +``` + +### Sample Dashboard Configuration + +1. Open Grafana (http://localhost:3000) +2. Go to "Dashboards" → "New Dashboard" +3. Add panels with the PromQL queries above +4. Configure visualization types: + - Time series for rates and durations + - Gauge for queue size + - Stat for current values + +## Troubleshooting + +### Check Pod Status +```bash +kubectl get pods -n observability +``` + +### Check OpenTelemetry Collector Logs +```bash +kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f +``` + +### Check Prometheus Targets +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Then open http://localhost:9090/targets + +### Verify Metrics are Being Collected +```bash +# Check if OpenTelemetry is receiving metrics +kubectl port-forward -n observability svc/otel-collector-prometheus 8889:8889 +curl http://localhost:8889/metrics | grep operator_sdk +``` + +### Test OTLP Endpoint +```bash +# Port forward the OTLP HTTP endpoint +kubectl port-forward -n observability svc/otel-collector-collector 4318:4318 + +# Send a test metric (requires curl and valid OTLP JSON) +# This is just for testing connectivity +curl -X POST http://localhost:4318/v1/metrics \ + -H "Content-Type: application/json" \ + -d '{"resourceMetrics":[]}' +``` + +## Uninstalling + +To remove all 
components: + +```bash +# Delete OpenTelemetry resources +kubectl delete -n observability OpenTelemetryCollector otel-collector + +# Uninstall Helm releases +helm uninstall -n observability kube-prometheus-stack +helm uninstall -n observability opentelemetry-operator +helm uninstall -n cert-manager cert-manager + +# Delete namespaces +kubectl delete namespace observability cert-manager +``` + +## References + +- [JOSDK Observability Documentation](https://javaoperatorsdk.io/docs/documentation/observability/) +- [OpenTelemetry Java Documentation](https://opentelemetry.io/docs/instrumentation/java/) +- [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) +- [Grafana Documentation](https://grafana.com/docs/) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) diff --git a/grafana/install-observability.sh b/grafana/install-observability.sh new file mode 100755 index 0000000000..63bdcb706f --- /dev/null +++ b/grafana/install-observability.sh @@ -0,0 +1,264 @@ +#!/bin/bash +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Observability Stack${NC}" +echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" +echo -e "${GREEN}========================================${NC}" + +# Check if minikube is running +echo -e "\n${YELLOW}Checking minikube status...${NC}" +if ! minikube status > /dev/null 2>&1; then + echo -e "${RED}Error: minikube is not running${NC}" + echo "Please start minikube with: minikube start" + exit 1 +fi +echo -e "${GREEN}✓ minikube is running${NC}" + +# Check if helm is installed +echo -e "\n${YELLOW}Checking helm installation...${NC}" +if ! command -v helm &> /dev/null; then + echo -e "${RED}Error: helm is not installed${NC}" + echo "Please install helm: https://helm.sh/docs/intro/install/" + exit 1 +fi +echo -e "${GREEN}✓ helm is installed${NC}" + +# Add Helm repositories +echo -e "\n${YELLOW}Adding Helm repositories...${NC}" +helm repo add jetstack https://charts.jetstack.io +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +echo -e "${GREEN}✓ Helm repositories added${NC}" + +# Install cert-manager (required for OpenTelemetry Operator) +echo -e "\n${YELLOW}Installing cert-manager...${NC}" +if kubectl get namespace cert-manager > /dev/null 2>&1; then + echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" +else + kubectl create namespace cert-manager + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true \ + --wait + echo -e "${GREEN}✓ cert-manager installed${NC}" +fi + +# Create observability namespace +echo -e "\n${YELLOW}Creating observability namespace...${NC}" +kubectl create namespace observability --dry-run=client -o yaml | kubectl 
apply -f - +echo -e "${GREEN}✓ observability namespace ready${NC}" + +# Install OpenTelemetry Operator +echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" +if helm list -n observability | grep -q opentelemetry-operator; then + echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" + helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +else + helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ + --namespace observability \ + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ + --wait +fi +echo -e "${GREEN}✓ OpenTelemetry Operator installed${NC}" + +# Install kube-prometheus-stack (includes Prometheus + Grafana) +echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" +if helm list -n observability | grep -q kube-prometheus-stack; then + echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" + helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +else + helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace observability \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ + --set grafana.adminPassword=admin \ + --wait +fi +echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" + +# Create OpenTelemetry Collector instance +echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" +kubectl apply -f - <io.javaoperatorsdk operator-framework + + io.javaoperatorsdk + micrometer-support + + + 
io.micrometer + micrometer-registry-otlp + ${micrometer-core.version} + + + org.yaml + snakeyaml + 2.3 + org.apache.logging.log4j log4j-slf4j2-impl diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 5366dc2e9a..78c05f8df7 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -16,14 +16,25 @@ package io.javaoperatorsdk.operator.sample; import java.io.IOException; +import java.io.InputStream; import java.net.InetSocketAddress; +import java.util.HashMap; +import java.util.Map; +import org.jspecify.annotations.NonNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; import io.javaoperatorsdk.operator.Operator; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetrics; import io.javaoperatorsdk.operator.sample.probes.LivenessHandler; import io.javaoperatorsdk.operator.sample.probes.StartupHandler; +import io.micrometer.core.instrument.Clock; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.registry.otlp.OtlpConfig; +import io.micrometer.registry.otlp.OtlpMeterRegistry; import com.sun.net.httpserver.HttpServer; @@ -40,7 +51,10 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false)); + // Load configuration from config.yaml + Metrics metrics = initOTLPMetrics(); + Operator operator = + new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if 
(WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -58,4 +72,37 @@ public static void main(String[] args) throws IOException { server.setExecutor(null); server.start(); } + + private static @NonNull Metrics initOTLPMetrics() { + Map configProperties = loadConfigFromYaml(); + OtlpConfig otlpConfig = configProperties::get; + + MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + return MicrometerMetrics.withoutPerResourceMetrics(registry); + } + + @SuppressWarnings("unchecked") + private static Map loadConfigFromYaml() { + Map configMap = new HashMap<>(); + try (InputStream inputStream = WebPageOperator.class.getResourceAsStream("/otlp-config.yaml")) { + if (inputStream == null) { + log.warn("otlp-config.yaml not found in resources, using default OTLP configuration"); + return configMap; + } + + Yaml yaml = new Yaml(); + Map yamlData = yaml.load(inputStream); + + // Navigate to otlp section and map properties directly + Map otlp = (Map) yamlData.get("otlp"); + if (otlp != null) { + otlp.forEach((key, value) -> configMap.put("otlp." 
+ key, value.toString())); + } + + log.info("Loaded OTLP configuration from otlp-config.yaml: {}", configMap); + } catch (IOException e) { + log.error("Error loading otlp-config.yaml", e); + } + return configMap; + } } diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml new file mode 100644 index 0000000000..30d6f283da --- /dev/null +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -0,0 +1,6 @@ +otlp: + # OTLP Collector endpoint - see observability/install-observability.sh for setup + url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" From ece63e8f6aaca8c4f1ccf27f7bd95e33f39a27ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Sun, 8 Feb 2026 16:24:42 +0100 Subject: [PATCH 06/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../webpage/src/main/resources/otlp-config.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml index 30d6f283da..ca93bfc965 100644 --- a/sample-operators/webpage/src/main/resources/otlp-config.yaml +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -1,3 +1,19 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + otlp: # OTLP Collector endpoint - see observability/install-observability.sh for setup url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" From 72ca6e8ead29f00d590043b4472b0ed60837fd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Sun, 8 Feb 2026 16:26:41 +0100 Subject: [PATCH 07/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- grafana/README.md | 225 -------------------------- grafana/install-observability.sh | 264 ------------------------------- 2 files changed, 489 deletions(-) delete mode 100644 grafana/README.md delete mode 100755 grafana/install-observability.sh diff --git a/grafana/README.md b/grafana/README.md deleted file mode 100644 index 35e1167190..0000000000 --- a/grafana/README.md +++ /dev/null @@ -1,225 +0,0 @@ -# Observability Stack for Java Operator SDK - -This directory contains scripts and configuration for setting up a complete observability stack on minikube. 
- -## Quick Start - -```bash -./install-observability.sh -``` - -This script installs: -- **OpenTelemetry Operator** - For collecting metrics and traces -- **Prometheus** - For metrics storage and querying -- **Grafana** - For visualization and dashboards -- **cert-manager** - Required for OpenTelemetry Operator webhooks - -## Prerequisites - -- kubectl configured -- Helm 3.x installed - -## Components Installed - -### OpenTelemetry Collector -- Receives metrics and traces via OTLP (gRPC and HTTP) -- Exports metrics to Prometheus format -- Configured with memory limiter and batch processing - -**Endpoints:** -- OTLP gRPC: `otel-collector-collector.observability.svc.cluster.local:4317` -- OTLP HTTP: `otel-collector-collector.observability.svc.cluster.local:4318` -- Prometheus metrics: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` - -### Prometheus -- Scrapes metrics from OpenTelemetry Collector -- Supports ServiceMonitor and PodMonitor CRDs -- Configured to discover all metrics automatically - -**Access:** -```bash -kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 -``` -Open http://localhost:9090 - -### Grafana -- Pre-configured with Prometheus as data source -- Includes Kubernetes monitoring dashboards - -**Access:** -```bash -kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 -``` -Open http://localhost:3000 -- **Username:** admin -- **Password:** admin - -## Integrating with Your Operator - -### 1. Add OpenTelemetry Dependency - -Add to your `pom.xml`: - -```xml - - io.javaoperatorsdk - operator-framework-opentelemetry-support - ${josdk.version} - -``` - -### 2. 
Configure OpenTelemetry in Your Operator - -In your operator code: - -```java -import io.javaoperatorsdk.operator.monitoring.opentelemetry.OpenTelemetryMetrics; -import io.opentelemetry.api.OpenTelemetry; -import io.opentelemetry.sdk.autoconfigure.AutoConfiguredOpenTelemetrySdk; - -// Initialize OpenTelemetry -OpenTelemetry openTelemetry = AutoConfiguredOpenTelemetrySdk.initialize() - .getOpenTelemetrySdk(); - -// Create JOSDK metrics instance -Metrics metrics = OpenTelemetryMetrics.builder(openTelemetry) - .build(); - -// Configure operator with metrics -Operator operator = new Operator(client, o -> o.withMetrics(metrics)); -``` - -### 3. Set Environment Variables - -In your operator deployment YAML: - -```yaml -env: - - name: OTEL_SERVICE_NAME - value: "your-operator-name" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://otel-collector-collector.observability.svc.cluster.local:4318" - - name: OTEL_METRICS_EXPORTER - value: "otlp" - - name: OTEL_TRACES_EXPORTER - value: "otlp" - - name: OTEL_EXPORTER_OTLP_PROTOCOL - value: "http/protobuf" -``` - -## Available JOSDK Metrics - -The following metrics are exported by JOSDK: - -| Metric | Type | Description | -|--------|------|-------------| -| `operator_sdk_reconciliations_started_total` | Counter | Total number of reconciliations started | -| `operator_sdk_reconciliations_success_total` | Counter | Total number of successful reconciliations | -| `operator_sdk_reconciliations_failed_total` | Counter | Total number of failed reconciliations | -| `operator_sdk_reconciliations_queue_size` | Gauge | Current reconciliation queue size | -| `operator_sdk_events_received_total` | Counter | Total number of Kubernetes events received | -| `operator_sdk_controllers_execution_reconcile_seconds` | Timer | Time taken for reconciliations | -| `operator_sdk_controllers_execution_cleanup_seconds` | Timer | Time taken for cleanup operations | - -## Creating Grafana Dashboards - -### Example PromQL Queries - -**Reconciliation 
Rate:** -```promql -sum(rate(operator_sdk_reconciliations_started_total[5m])) by (controller) -``` - -**Success Rate:** -```promql -sum(rate(operator_sdk_reconciliations_success_total[5m])) / -sum(rate(operator_sdk_reconciliations_started_total[5m])) -``` - -**Error Rate:** -```promql -sum(rate(operator_sdk_reconciliations_failed_total[5m])) by (controller, exception) -``` - -**Queue Size:** -```promql -operator_sdk_reconciliations_queue_size -``` - -**Average Reconciliation Duration:** -```promql -rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / -rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) -``` - -### Sample Dashboard Configuration - -1. Open Grafana (http://localhost:3000) -2. Go to "Dashboards" → "New Dashboard" -3. Add panels with the PromQL queries above -4. Configure visualization types: - - Time series for rates and durations - - Gauge for queue size - - Stat for current values - -## Troubleshooting - -### Check Pod Status -```bash -kubectl get pods -n observability -``` - -### Check OpenTelemetry Collector Logs -```bash -kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f -``` - -### Check Prometheus Targets -```bash -kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 -``` -Then open http://localhost:9090/targets - -### Verify Metrics are Being Collected -```bash -# Check if OpenTelemetry is receiving metrics -kubectl port-forward -n observability svc/otel-collector-prometheus 8889:8889 -curl http://localhost:8889/metrics | grep operator_sdk -``` - -### Test OTLP Endpoint -```bash -# Port forward the OTLP HTTP endpoint -kubectl port-forward -n observability svc/otel-collector-collector 4318:4318 - -# Send a test metric (requires curl and valid OTLP JSON) -# This is just for testing connectivity -curl -X POST http://localhost:4318/v1/metrics \ - -H "Content-Type: application/json" \ - -d '{"resourceMetrics":[]}' -``` - -## Uninstalling - -To remove all 
components: - -```bash -# Delete OpenTelemetry resources -kubectl delete -n observability OpenTelemetryCollector otel-collector - -# Uninstall Helm releases -helm uninstall -n observability kube-prometheus-stack -helm uninstall -n observability opentelemetry-operator -helm uninstall -n cert-manager cert-manager - -# Delete namespaces -kubectl delete namespace observability cert-manager -``` - -## References - -- [JOSDK Observability Documentation](https://javaoperatorsdk.io/docs/documentation/observability/) -- [OpenTelemetry Java Documentation](https://opentelemetry.io/docs/instrumentation/java/) -- [Prometheus Operator](https://github.com/prometheus-operator/prometheus-operator) -- [Grafana Documentation](https://grafana.com/docs/) -- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) diff --git a/grafana/install-observability.sh b/grafana/install-observability.sh deleted file mode 100755 index 63bdcb706f..0000000000 --- a/grafana/install-observability.sh +++ /dev/null @@ -1,264 +0,0 @@ -#!/bin/bash -# -# Copyright Java Operator SDK Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -set -e - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -echo -e "${GREEN}========================================${NC}" -echo -e "${GREEN}Installing Observability Stack${NC}" -echo -e "${GREEN}OpenTelemetry + Prometheus + Grafana${NC}" -echo -e "${GREEN}========================================${NC}" - -# Check if minikube is running -echo -e "\n${YELLOW}Checking minikube status...${NC}" -if ! minikube status > /dev/null 2>&1; then - echo -e "${RED}Error: minikube is not running${NC}" - echo "Please start minikube with: minikube start" - exit 1 -fi -echo -e "${GREEN}✓ minikube is running${NC}" - -# Check if helm is installed -echo -e "\n${YELLOW}Checking helm installation...${NC}" -if ! command -v helm &> /dev/null; then - echo -e "${RED}Error: helm is not installed${NC}" - echo "Please install helm: https://helm.sh/docs/intro/install/" - exit 1 -fi -echo -e "${GREEN}✓ helm is installed${NC}" - -# Add Helm repositories -echo -e "\n${YELLOW}Adding Helm repositories...${NC}" -helm repo add jetstack https://charts.jetstack.io -helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts -helm repo add prometheus-community https://prometheus-community.github.io/helm-charts -helm repo update -echo -e "${GREEN}✓ Helm repositories added${NC}" - -# Install cert-manager (required for OpenTelemetry Operator) -echo -e "\n${YELLOW}Installing cert-manager...${NC}" -if kubectl get namespace cert-manager > /dev/null 2>&1; then - echo -e "${YELLOW}cert-manager namespace already exists, skipping...${NC}" -else - kubectl create namespace cert-manager - helm install cert-manager jetstack/cert-manager \ - --namespace cert-manager \ - --set crds.enabled=true \ - --wait - echo -e "${GREEN}✓ cert-manager installed${NC}" -fi - -# Create observability namespace -echo -e "\n${YELLOW}Creating observability namespace...${NC}" -kubectl create namespace observability --dry-run=client -o yaml | kubectl 
apply -f - -echo -e "${GREEN}✓ observability namespace ready${NC}" - -# Install OpenTelemetry Operator -echo -e "\n${YELLOW}Installing OpenTelemetry Operator...${NC}" -if helm list -n observability | grep -q opentelemetry-operator; then - echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" - helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ - --namespace observability \ - --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait -else - helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ - --namespace observability \ - --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait -fi -echo -e "${GREEN}✓ OpenTelemetry Operator installed${NC}" - -# Install kube-prometheus-stack (includes Prometheus + Grafana) -echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" -if helm list -n observability | grep -q kube-prometheus-stack; then - echo -e "${YELLOW}kube-prometheus-stack already installed, upgrading...${NC}" - helm upgrade kube-prometheus-stack prometheus-community/kube-prometheus-stack \ - --namespace observability \ - --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ - --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait -else - helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ - --namespace observability \ - --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ - --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait -fi -echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" - -# Create OpenTelemetry Collector instance -echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" -kubectl apply -f - < Date: Mon, 9 Feb 2026 09:16:51 +0100 Subject: [PATCH 08/25] wip MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 41 ++++++++++++++++--- .../operator/sample/WebPageOperator.java | 21 +++++++++- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 314ee1e4aa..2c81f2bf38 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -199,6 +199,36 @@ echo -e "\n${YELLOW}Waiting for all pods to be ready...${NC}" kubectl wait --for=condition=ready pod --all -n observability --timeout=300s echo -e "${GREEN}✓ All pods are ready${NC}" +# Import Grafana dashboards +echo -e "\n${YELLOW}Importing Grafana dashboards...${NC}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [ -f "$SCRIPT_DIR/jvm-metrics-dashboard.json" ]; then + kubectl create configmap jvm-metrics-dashboard \ + --from-file="$SCRIPT_DIR/jvm-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JVM Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JVM Metrics dashboard not found at $SCRIPT_DIR/jvm-metrics-dashboard.json${NC}" +fi + +if [ -f "$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" ]; then + kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file="$SCRIPT_DIR/josdk-operator-metrics-dashboard.json" \ + -n observability \ + --dry-run=client -o yaml | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + echo -e "${GREEN}✓ JOSDK Operator Metrics dashboard imported${NC}" +else + echo -e "${YELLOW}⚠ JOSDK Operator Metrics dashboard not found at $SCRIPT_DIR/josdk-operator-metrics-dashboard.json${NC}" +fi + +echo -e "${GREEN}✓ Dashboards will be available in Grafana shortly${NC}" + # Get pod statuses echo -e 
"\n${GREEN}========================================${NC}" echo -e "${GREEN}Installation Complete!${NC}" @@ -237,16 +267,17 @@ echo -e " ${GREEN}OTEL_TRACES_EXPORTER=otlp${NC}" echo -e "\n${GREEN}========================================${NC}" echo -e "${GREEN}Grafana Dashboards${NC}" echo -e "${GREEN}========================================${NC}" -echo -e "\nPre-installed dashboards in Grafana:" +echo -e "\nAutomatically imported dashboards:" +echo -e " - ${GREEN}JOSDK - JVM Metrics${NC} - Java Virtual Machine health and performance" +echo -e " - ${GREEN}JOSDK - Operator Metrics${NC} - Kubernetes operator performance and reconciliation" +echo -e "\nPre-installed Kubernetes dashboards:" echo -e " - Kubernetes / Compute Resources / Cluster" echo -e " - Kubernetes / Compute Resources / Namespace (Pods)" echo -e " - Node Exporter / Nodes" -echo -e "\nFor JOSDK metrics, create a custom dashboard with queries like:" -echo -e " ${GREEN}sum(rate(operator_sdk_reconciliations_started_total[5m]))${NC}" -echo -e " ${GREEN}sum(rate(operator_sdk_reconciliations_success_total[5m]))${NC}" -echo -e " ${GREEN}sum(rate(operator_sdk_reconciliations_failed_total[5m]))${NC}" +echo -e "\n${YELLOW}Note:${NC} Dashboards may take 30-60 seconds to appear in Grafana after installation." 
echo -e "\n${YELLOW}To uninstall:${NC}" +echo -e " kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard" echo -e " kubectl delete -n observability OpenTelemetryCollector otel-collector" echo -e " helm uninstall -n observability kube-prometheus-stack" echo -e " helm uninstall -n observability opentelemetry-operator" diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 78c05f8df7..a2c342dc5e 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -33,6 +33,12 @@ import io.javaoperatorsdk.operator.sample.probes.StartupHandler; import io.micrometer.core.instrument.Clock; import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmMemoryMetrics; +import io.micrometer.core.instrument.binder.jvm.JvmThreadMetrics; +import io.micrometer.core.instrument.binder.system.ProcessorMetrics; +import io.micrometer.core.instrument.binder.system.UptimeMetrics; import io.micrometer.registry.otlp.OtlpConfig; import io.micrometer.registry.otlp.OtlpMeterRegistry; @@ -78,7 +84,20 @@ public static void main(String[] args) throws IOException { OtlpConfig otlpConfig = configProperties::get; MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); - return MicrometerMetrics.withoutPerResourceMetrics(registry); + + // Register JVM and system metrics + log.info("Registering JVM and system metrics..."); + new JvmMemoryMetrics().bindTo(registry); + new JvmGcMetrics().bindTo(registry); + new JvmThreadMetrics().bindTo(registry); + new 
ClassLoaderMetrics().bindTo(registry); + new ProcessorMetrics().bindTo(registry); + new UptimeMetrics().bindTo(registry); + log.info("JVM and system metrics registered"); + + return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) + .collectingMetricsPerResource() + .build(); } @SuppressWarnings("unchecked") From 899e34564857f872d7e257f65c128a110412b153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 09:27:30 +0100 Subject: [PATCH 09/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/README.md | 246 ++++ .../josdk-operator-metrics-dashboard.json | 1109 +++++++++++++++++ observability/jvm-metrics-dashboard.json | 857 +++++++++++++ 3 files changed, 2212 insertions(+) create mode 100644 observability/README.md create mode 100644 observability/josdk-operator-metrics-dashboard.json create mode 100644 observability/jvm-metrics-dashboard.json diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 0000000000..9706a466e9 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,246 @@ +# Observability Stack for Java Operator SDK + +This directory contains the setup scripts and Grafana dashboards for monitoring Java Operator SDK applications. 
+ +## Installation + +Run the installation script to deploy the full observability stack (OpenTelemetry Collector, Prometheus, and Grafana): + +```bash +./install-observability.sh +``` + +This will install: +- **cert-manager** - Required for OpenTelemetry Operator +- **OpenTelemetry Operator** - Manages OpenTelemetry Collector instances +- **OpenTelemetry Collector** - Receives OTLP metrics and exports to Prometheus +- **Prometheus** - Metrics storage and querying +- **Grafana** - Metrics visualization + +## Accessing Services + +### Grafana +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-grafana 3000:80 +``` +Then open http://localhost:3000 +- Username: `admin` +- Password: `admin` + +### Prometheus +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Then open http://localhost:9090 + +## Grafana Dashboards + +Two pre-configured dashboards are **automatically imported** during installation: + +### 1. JVM Metrics Dashboard (`jvm-metrics-dashboard.json`) + +Monitors Java Virtual Machine health and performance: + +**Panels:** +- **JVM Memory Used** - Heap and non-heap memory consumption by memory pool +- **JVM Threads** - Live, daemon, and peak thread counts +- **GC Pause Time Rate** - Garbage collection pause duration +- **GC Pause Count Rate** - Frequency of garbage collection events +- **CPU Usage** - System CPU utilization percentage +- **Classes Loaded** - Number of classes currently loaded +- **Process Uptime** - Application uptime in seconds +- **CPU Count** - Available processor cores +- **GC Memory Allocation Rate** - Memory allocation and promotion rates +- **Heap Memory Max vs Committed** - Heap memory limits and commitments + +**Key Metrics:** +- `jvm.memory.used`, `jvm.memory.max`, `jvm.memory.committed` +- `jvm.gc.pause`, `jvm.gc.memory.allocated`, `jvm.gc.memory.promoted` +- `jvm.threads.live`, `jvm.threads.daemon`, `jvm.threads.peak` +- `jvm.classes.loaded`, 
`jvm.classes.unloaded` +- `system.cpu.usage`, `system.cpu.count` +- `process.uptime` + +### 2. Java Operator SDK Metrics Dashboard (`josdk-operator-metrics-dashboard.json`) + +Monitors Kubernetes operator performance and health: + +**Panels:** +- **Reconciliation Rate (Started)** - Rate of reconciliation loops triggered +- **Reconciliation Success vs Failure Rate** - Success/failure ratio over time +- **Currently Executing Reconciliations** - Active reconciliation threads +- **Reconciliation Queue Size** - Pending reconciliation work +- **Total Reconciliations** - Cumulative count of reconciliations +- **Error Rate** - Overall error rate across all reconciliations +- **Reconciliation Execution Time** - P50, P95, P99 latency percentiles +- **Event Reception Rate** - Kubernetes event processing rate +- **Failures by Exception Type** - Breakdown of errors by exception class +- **Controller Execution Success vs Failure** - Controller-level success metrics +- **Delete Event Rate** - Resource deletion event frequency +- **Reconciliation Retry Rate** - Retry attempts and patterns + +**Key Metrics:** +- `operator.sdk.reconciliations.started`, `.success`, `.failed` +- `operator.sdk.reconciliations.executions` - Current execution count +- `operator.sdk.reconciliations.queue.size` - Queue depth +- `operator.sdk.controllers.execution.reconcile` - Execution timing histograms +- `operator.sdk.events.received`, `.delete` - Event reception +- Retry metrics and failure breakdowns + +## Importing Dashboards into Grafana + +### Automatic Import (Default) + +The dashboards are **automatically imported** when you run `./install-observability.sh`. They will appear in Grafana within 30-60 seconds after installation. No manual steps required! + +To verify the dashboards were imported: +1. Access Grafana at http://localhost:3000 +2. Navigate to **Dashboards** → **Browse** +3. 
Look for "JOSDK - JVM Metrics" and "JOSDK - Operator Metrics" + +### Manual Import Methods + +If you need to re-import or update the dashboards manually: + +#### Method 1: Via Grafana UI + +1. Access Grafana at http://localhost:3000 +2. Login with admin/admin +3. Navigate to **Dashboards** → **Import** +4. Click **Upload JSON file** +5. Select `jvm-metrics-dashboard.json` or `josdk-operator-metrics-dashboard.json` +6. Select **Prometheus** as the data source +7. Click **Import** + +#### Method 2: Via kubectl ConfigMap + +```bash +# Re-import JVM dashboard +kubectl create configmap jvm-metrics-dashboard \ + --from-file=jvm-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - + +# Re-import Operator dashboard +kubectl create configmap josdk-operator-metrics-dashboard \ + --from-file=josdk-operator-metrics-dashboard.json \ + -n observability \ + -o yaml --dry-run=client | \ + kubectl label --dry-run=client --local -f - grafana_dashboard=1 -o yaml | \ + kubectl apply -f - +``` + +The dashboards will be automatically discovered and loaded by Grafana within 30-60 seconds. + +## Configuring Your Operator + +To enable metrics export from your JOSDK operator, ensure your application: + +1. **Has the required dependency** (already included in webpage sample): + ```xml + <dependency> + <groupId>io.micrometer</groupId> + <artifactId>micrometer-registry-otlp</artifactId> + </dependency> + ``` + +2. **Configures OTLP export** via `otlp-config.yaml`: + ```yaml + otlp: + url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" + ``` + +3.
**Registers JVM and JOSDK metrics** (see `WebPageOperator.java` for reference implementation) + +## OTLP Endpoints + +The OpenTelemetry Collector provides the following endpoints: + +- **OTLP gRPC**: `otel-collector-collector.observability.svc.cluster.local:4317` +- **OTLP HTTP**: `otel-collector-collector.observability.svc.cluster.local:4318` +- **Prometheus Scrape**: `http://otel-collector-prometheus.observability.svc.cluster.local:8889/metrics` + +## Troubleshooting + +### Check OpenTelemetry Collector Logs +```bash +kubectl logs -n observability -l app.kubernetes.io/name=otel-collector -f +``` + +### Check Prometheus Targets +```bash +kubectl port-forward -n observability svc/kube-prometheus-stack-prometheus 9090:9090 +``` +Open http://localhost:9090/targets and verify the OTLP collector target is UP. + +### Verify Metrics in Prometheus +Open Prometheus UI and search for metrics: +- JVM metrics: `otel_jvm_*` +- Operator metrics: `otel_operator_sdk_*` + +### Check Grafana Data Source +1. Navigate to **Configuration** → **Data Sources** +2. Verify Prometheus data source is configured and working +3. Click **Test** to verify connectivity + +## Uninstalling + +To remove the observability stack: + +```bash +kubectl delete configmap -n observability jvm-metrics-dashboard josdk-operator-metrics-dashboard +kubectl delete -n observability OpenTelemetryCollector otel-collector +helm uninstall -n observability kube-prometheus-stack +helm uninstall -n observability opentelemetry-operator +helm uninstall -n cert-manager cert-manager +kubectl delete namespace observability cert-manager +``` + +## Customizing Dashboards + +The dashboard JSON files can be modified to: +- Add new panels for custom metrics +- Adjust time ranges and refresh intervals +- Change visualization types +- Add templating variables for filtering +- Modify alert thresholds + +After making changes, re-import the dashboard using one of the methods above. 
+ +## Example Queries + +### JVM Metrics +```promql +# Heap memory usage percentage +(otel_jvm_memory_used_bytes{area="heap"} / otel_jvm_memory_max_bytes{area="heap"}) * 100 + +# GC throughput (percentage of time NOT in GC) +100 - (rate(otel_jvm_gc_pause_seconds_sum[5m]) * 100) + +# Thread count trend +otel_jvm_threads_live_threads +``` + +### Operator Metrics +```promql +# Reconciliation success rate +rate(otel_operator_sdk_reconciliations_success_total[5m]) / rate(otel_operator_sdk_reconciliations_started_total[5m]) + +# Average reconciliation time +rate(otel_operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(otel_operator_sdk_controllers_execution_reconcile_seconds_count[5m]) + +# Queue saturation +otel_operator_sdk_reconciliations_queue_size / on() group_left() max(otel_operator_sdk_reconciliations_queue_size) +``` + +## References + +- [Java Operator SDK Documentation](https://javaoperatorsdk.io) +- [Micrometer OTLP Documentation](https://micrometer.io/docs/registry/otlp) +- [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) +- [Grafana Dashboards](https://grafana.com/docs/grafana/latest/dashboards/) diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json new file mode 100644 index 0000000000..006821a467 --- /dev/null +++ b/observability/josdk-operator-metrics-dashboard.json @@ -0,0 +1,1109 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of reconciliations started per second", + "fieldConfig": { + "defaults": { + "color": { 
+ "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "legendFormat": "{{kind}} ({{version}})", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Rate (Started)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Success vs Failure rate of reconciliations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { 
+ "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_success_total{job=\"webpage-operator\"}[5m]))", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "legendFormat": "Failure", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Success vs Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current number of reconciliations being executed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + 
"value": 10 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(otel_operator_sdk_reconciliations_executions{job=\"webpage-operator\"})", + "legendFormat": "Executing", + "range": true, + "refId": "A" + } + ], + "title": "Currently Executing Reconciliations", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Current reconciliation queue size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 4, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(otel_operator_sdk_reconciliations_queue_size{job=\"webpage-operator\"})", + "legendFormat": "Queue Size", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Queue Size", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Total reconciliations started", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + 
}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"})", + "legendFormat": "Total", + "range": true, + "refId": "A" + } + ], + "title": "Total Reconciliations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Overall rate of failed reconciliations per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "legendFormat": "Error Rate", + "range": true, + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller
execution time percentiles", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "legendFormat": "p50 - {{controller}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "legendFormat": "p95 - {{controller}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, 
sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "legendFormat": "p99 - {{controller}}", + "range": true, + "refId": "C" + } + ], + "title": "Reconciliation Execution Time (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of events received by the operator", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_events_received_total{job=\"webpage-operator\"}[5m])) by (event, action)", + "legendFormat": "{{event}} - {{action}}", + "range": true, + "refId": "A" + } + ], + "title": "Event Reception Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Failures by exception type", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m])) by (exception)", + "legendFormat": "{{exception}}", + "range": true, + "refId": "A" + } + ], + "title": "Failures by Exception Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Controller execution success vs failure", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 
5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_success_total{job=\"webpage-operator\"}[5m])) by (type)", + "legendFormat": "Success - {{type}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_failure_total{job=\"webpage-operator\"}[5m])) by (exception)", + "legendFormat": "Failure - {{exception}}", + "range": true, + "refId": "B" + } + ], + "title": "Controller Execution Success vs Failure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Rate of delete events received", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": 
false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 11, + "options": { + "legend": { + "calcs": ["last", "sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_events_delete_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "legendFormat": "{{kind}} ({{version}})", + "range": true, + "refId": "A" + } + ], + "title": "Delete Event Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Reconciliation retry information", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 12, + "options": { + "legend": { + "calcs": ["last", "mean"], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", + "legendFormat": "Last Retry Attempts", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", + "legendFormat": "Retries (Not Last)", + "range": true, + "refId": "B" + } + ], + "title": "Reconciliation Retry Rate", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["operator", "kubernetes", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - Operator Metrics", + "uid": "josdk-operator-metrics", + "version": 0, + "weekStart": "" +} diff --git a/observability/jvm-metrics-dashboard.json b/observability/jvm-metrics-dashboard.json new file mode 100644 index 0000000000..0a817aa09c --- /dev/null +++ b/observability/jvm-metrics-dashboard.json @@ -0,0 +1,857 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_memory_used_bytes{job=\"webpage-operator\"}", + "legendFormat": "{{area}} - {{id}}", + "range": true, + "refId": "A" + } + ], + "title": "JVM Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_threads_live_threads{job=\"webpage-operator\"}", + "legendFormat": "Live Threads", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_threads_daemon_threads{job=\"webpage-operator\"}", + "legendFormat": "Daemon Threads", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_threads_peak_threads{job=\"webpage-operator\"}", + "legendFormat": "Peak Threads", + "range": true, + "refId": "C" + } + ], + "title": "JVM Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(otel_jvm_gc_pause_seconds_sum{job=\"webpage-operator\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Time Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "rate(otel_jvm_gc_pause_seconds_count{job=\"webpage-operator\"}[5m])", + "legendFormat": "{{action}} - {{cause}}", + "range": true, + "refId": "A" + } + ], + "title": "GC Pause Count Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_system_cpu_usage{job=\"webpage-operator\"}", + "legendFormat": "CPU Usage", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_classes_loaded_classes{job=\"webpage-operator\"}", + "legendFormat": "Classes Loaded", + "range": true, + 
"refId": "A" + } + ], + "title": "Classes Loaded", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_process_uptime_seconds{job=\"webpage-operator\"}", + "legendFormat": "Uptime", + "range": true, + "refId": "A" + } + ], + "title": "Process Uptime", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 8, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": ["lastNotNull"], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_system_cpu_count{job=\"webpage-operator\"}", + "legendFormat": "CPU Count", + "range": true, + "refId": "A" + } + ], + "title": "CPU Count", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": 
{ + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(otel_jvm_gc_memory_allocated_bytes_total{job=\"webpage-operator\"}[5m])", + "legendFormat": "Allocated", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(otel_jvm_gc_memory_promoted_bytes_total{job=\"webpage-operator\"}[5m])", + "legendFormat": "Promoted", + "range": true, + "refId": "B" + } + ], + "title": "GC Memory Allocation Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { 
+ "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_memory_max_bytes{job=\"webpage-operator\", area=\"heap\"}", + "legendFormat": "Max Heap", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "otel_jvm_memory_committed_bytes{job=\"webpage-operator\", area=\"heap\"}", + "legendFormat": "Committed Heap", + "range": true, + "refId": "B" + } + ], + "title": "Heap Memory Max vs Committed", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": ["jvm", "java", "josdk"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "JOSDK - JVM Metrics", + "uid": "josdk-jvm-metrics", + "version": 0, + "weekStart": "" +} From ff05901f8a5089229a28b1ad70288ff69b044bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 10:06:15 +0100 Subject: [PATCH 10/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- 
observability/install-observability.sh | 45 +++++++++++++------ .../operator/sample/WebPageOperator.java | 4 +- .../webpage/src/main/resources/log4j2.xml | 2 +- .../src/main/resources/otlp-config.yaml | 3 +- 4 files changed, 36 insertions(+), 18 deletions(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 2c81f2bf38..017e9adf86 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -45,6 +45,17 @@ helm repo add prometheus-community https://prometheus-community.github.io/helm-c helm repo update echo -e "${GREEN}✓ Helm repositories added${NC}" +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}Installing Components (Parallel)${NC}" +echo -e "${GREEN}========================================${NC}" +echo -e "The following will be installed:" +echo -e " • cert-manager" +echo -e " • OpenTelemetry Operator" +echo -e " • Prometheus & Grafana" +echo -e " • OpenTelemetry Collector" +echo -e " • Service Monitors" +echo -e "\n${YELLOW}All resources will be applied first, then we'll wait for them to become ready.${NC}\n" + # Install cert-manager (required for OpenTelemetry Operator) echo -e "\n${YELLOW}Installing cert-manager...${NC}" if kubectl get namespace cert-manager > /dev/null 2>&1; then @@ -53,9 +64,8 @@ else kubectl create namespace cert-manager helm install cert-manager jetstack/cert-manager \ --namespace cert-manager \ - --set crds.enabled=true \ - --wait - echo -e "${GREEN}✓ cert-manager installed${NC}" + --set crds.enabled=true + echo -e "${GREEN}✓ cert-manager installation started${NC}" fi # Create observability namespace @@ -70,15 +80,13 @@ if helm list -n observability | grep -q opentelemetry-operator; then echo -e "${YELLOW}OpenTelemetry Operator already installed, upgrading...${NC}" helm upgrade opentelemetry-operator open-telemetry/opentelemetry-operator \ --namespace observability \ - --set 
"manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" else helm install opentelemetry-operator open-telemetry/opentelemetry-operator \ --namespace observability \ - --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" \ - --wait + --set "manager.collectorImage.repository=otel/opentelemetry-collector-contrib" fi -echo -e "${GREEN}✓ OpenTelemetry Operator installed${NC}" +echo -e "${GREEN}✓ OpenTelemetry Operator installation started${NC}" # Install kube-prometheus-stack (includes Prometheus + Grafana) echo -e "\n${YELLOW}Installing Prometheus and Grafana stack...${NC}" @@ -88,17 +96,15 @@ if helm list -n observability | grep -q kube-prometheus-stack; then --namespace observability \ --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait + --set grafana.adminPassword=admin else helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ --namespace observability \ --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \ - --set grafana.adminPassword=admin \ - --wait + --set grafana.adminPassword=admin fi -echo -e "${GREEN}✓ Prometheus and Grafana installed${NC}" +echo -e "${GREEN}✓ Prometheus and Grafana installation started${NC}" # Create OpenTelemetry Collector instance echo -e "\n${YELLOW}Creating OpenTelemetry Collector...${NC}" @@ -195,8 +201,19 @@ EOF echo -e "${GREEN}✓ ServiceMonitor created${NC}" # Wait for all pods to be ready -echo -e "\n${YELLOW}Waiting for all pods to be ready...${NC}" +echo -e "\n${GREEN}========================================${NC}" +echo -e "${GREEN}All resources have been applied!${NC}" +echo -e 
"${GREEN}========================================${NC}" +echo -e "\n${YELLOW}Waiting for all pods to become ready (this may take 2-3 minutes)...${NC}" + +# Wait for cert-manager pods +echo -e "${YELLOW}Checking cert-manager pods...${NC}" +kubectl wait --for=condition=ready pod --all -n cert-manager --timeout=300s 2>/dev/null || echo -e "${YELLOW}cert-manager already running or skipped${NC}" + +# Wait for observability pods +echo -e "${YELLOW}Checking observability pods...${NC}" kubectl wait --for=condition=ready pod --all -n observability --timeout=300s + echo -e "${GREEN}✓ All pods are ready${NC}" # Import Grafana dashboards diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index a2c342dc5e..dd1155eab3 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -61,6 +61,7 @@ public static void main(String[] args) throws IOException { Metrics metrics = initOTLPMetrics(); Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); + String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if (WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -81,7 +82,7 @@ public static void main(String[] args) throws IOException { private static @NonNull Metrics initOTLPMetrics() { Map configProperties = loadConfigFromYaml(); - OtlpConfig otlpConfig = configProperties::get; + OtlpConfig otlpConfig = key -> configProperties.get(key); MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); @@ -93,7 +94,6 @@ public static void main(String[] args) throws IOException { new ClassLoaderMetrics().bindTo(registry); new ProcessorMetrics().bindTo(registry); 
new UptimeMetrics().bindTo(registry); - log.info("JVM and system metrics registered"); return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) .collectingMetricsPerResource() diff --git a/sample-operators/webpage/src/main/resources/log4j2.xml b/sample-operators/webpage/src/main/resources/log4j2.xml index 0bf270c7e6..ebe273e40e 100644 --- a/sample-operators/webpage/src/main/resources/log4j2.xml +++ b/sample-operators/webpage/src/main/resources/log4j2.xml @@ -23,7 +23,7 @@ - + diff --git a/sample-operators/webpage/src/main/resources/otlp-config.yaml b/sample-operators/webpage/src/main/resources/otlp-config.yaml index ca93bfc965..17d773eb70 100644 --- a/sample-operators/webpage/src/main/resources/otlp-config.yaml +++ b/sample-operators/webpage/src/main/resources/otlp-config.yaml @@ -16,7 +16,8 @@ otlp: # OTLP Collector endpoint - see observability/install-observability.sh for setup - url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + url: "http://localhost:4318/v1/metrics" +# url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" step: 15s batchSize: 15000 aggregationTemporality: "cumulative" From f7e2565189dc61e6499d7e23547d8ae9a25e2500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 10:19:39 +0100 Subject: [PATCH 11/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- sample-operators/webpage/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sample-operators/webpage/README.md b/sample-operators/webpage/README.md index 7718d0f2f3..96329d18a9 100644 --- a/sample-operators/webpage/README.md +++ b/sample-operators/webpage/README.md @@ -76,3 +76,6 @@ of your choice. The JAR file is built using your local Maven and JDK and then co 1. Deploy the CRD: `kubectl apply -f target/classes/META-INF/fabric8/webpages.sample.javaoperatorsdk-v1.yml` 2. 
Deploy the operator: `kubectl apply -f k8s/operator.yaml` + +To install observability components - such as Prometheus, Open Telemetry, Grafana use - execute: +[install-observability.sh](../../observability/install-observability.sh) From 98f200f2b158d1d3f1e9c7fded43ef40cce3c6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 15:46:11 +0100 Subject: [PATCH 12/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 3 ++- sample-operators/webpage/pom.xml | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 017e9adf86..ea3a083eec 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -181,7 +181,7 @@ spec: targetPort: 8889 protocol: TCP selector: - app.kubernetes.io/name: otel-collector + app.kubernetes.io/name: otel-collector-collector --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor @@ -190,6 +190,7 @@ metadata: namespace: observability labels: app: otel-collector + release: kube-prometheus-stack spec: selector: matchLabels: diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index 97c885e403..10b0352605 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,6 +39,13 @@ pom import + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + From 77307e23297cbc817f3e25bc32f0618ee6b69943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 15:47:03 +0100 Subject: [PATCH 13/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- sample-operators/webpage/pom.xml | 14 +++++++------- .../operator/sample/WebPageOperator.java | 1 + 2 files changed, 8 
insertions(+), 7 deletions(-) diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index 10b0352605..f8c79cf268 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,13 +39,13 @@ pom import - - io.micrometer - micrometer-bom - ${micrometer-core.version} - pom - import - + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index dd1155eab3..837963f00a 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -57,6 +57,7 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); + // TODO remove otel prefix, add job and additional labels?! 
// Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); Operator operator = From 1daab47fa8dc6f99620f80482d9d0abfaa3a767e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 16:18:30 +0100 Subject: [PATCH 14/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../java/io/javaoperatorsdk/operator/sample/WebPageOperator.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 837963f00a..d92dfdd863 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -58,6 +58,7 @@ public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); // TODO remove otel prefix, add job and additional labels?! 
+ // TODO add test for checking if there are metrics in prometheus // Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); Operator operator = From cefad784386786f79891ff849f72c5a58b94980e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 16:59:54 +0100 Subject: [PATCH 15/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/install-observability.sh | 3 ++- .../operator/sample/WebPageOperator.java | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/observability/install-observability.sh b/observability/install-observability.sh index ea3a083eec..e724ac54d4 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -143,7 +143,7 @@ spec: exporters: prometheus: endpoint: "0.0.0.0:8889" - namespace: "otel" + namespace: "" send_timestamps: true metric_expiration: 5m debug: @@ -192,6 +192,7 @@ metadata: app: otel-collector release: kube-prometheus-stack spec: + jobLabel: app selector: matchLabels: app: otel-collector diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index d92dfdd863..e43a253511 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -22,6 +22,7 @@ import java.util.Map; import org.jspecify.annotations.NonNull; +import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.yaml.snakeyaml.Yaml; @@ -84,7 +85,22 @@ public static void main(String[] args) throws IOException { private static @NonNull Metrics initOTLPMetrics() { Map configProperties = loadConfigFromYaml(); - 
OtlpConfig otlpConfig = key -> configProperties.get(key); + OtlpConfig otlpConfig = new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + @Override + public Map resourceAttributes() { + return Map.of("service.name","josdk","operator","webpage"); + } + }; MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); From b40766e992cc9b9a457c38f60b8cc5caf176ef1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 18:42:54 +0100 Subject: [PATCH 16/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- observability/README.md | 22 +++++++---- observability/install-observability.sh | 2 + .../josdk-operator-metrics-dashboard.json | 34 ++++++++-------- observability/jvm-metrics-dashboard.json | 30 +++++++------- .../operator/sample/WebPageOperator.java | 39 +++++++++++-------- 5 files changed, 70 insertions(+), 57 deletions(-) diff --git a/observability/README.md b/observability/README.md index 9706a466e9..58caae27d0 100644 --- a/observability/README.md +++ b/observability/README.md @@ -61,6 +61,9 @@ Monitors Java Virtual Machine health and performance: - `system.cpu.usage`, `system.cpu.count` - `process.uptime` +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. + ### 2. Java Operator SDK Metrics Dashboard (`josdk-operator-metrics-dashboard.json`) Monitors Kubernetes operator performance and health: @@ -87,6 +90,9 @@ Monitors Kubernetes operator performance and health: - `operator.sdk.events.received`, `.delete` - Event reception - Retry metrics and failure breakdowns +**Filtering:** +All panels filter by `service_name="josdk"` to show metrics only from your operator. 
+ ## Importing Dashboards into Grafana ### Automatic Import (Default) @@ -180,8 +186,8 @@ Open http://localhost:9090/targets and verify the OTLP collector target is UP. ### Verify Metrics in Prometheus Open Prometheus UI and search for metrics: -- JVM metrics: `otel_jvm_*` -- Operator metrics: `otel_operator_sdk_*` +- JVM metrics: `jvm_*` +- Operator metrics: `operator_sdk_*` ### Check Grafana Data Source 1. Navigate to **Configuration** → **Data Sources** @@ -217,25 +223,25 @@ After making changes, re-import the dashboard using one of the methods above. ### JVM Metrics ```promql # Heap memory usage percentage -(otel_jvm_memory_used_bytes{area="heap"} / otel_jvm_memory_max_bytes{area="heap"}) * 100 +(jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 # GC throughput (percentage of time NOT in GC) -100 - (rate(otel_jvm_gc_pause_seconds_sum[5m]) * 100) +100 - (rate(jvm_gc_pause_seconds_sum[5m]) * 100) # Thread count trend -otel_jvm_threads_live_threads +jvm_threads_live_threads ``` ### Operator Metrics ```promql # Reconciliation success rate -rate(otel_operator_sdk_reconciliations_success_total[5m]) / rate(otel_operator_sdk_reconciliations_started_total[5m]) +rate(operator_sdk_reconciliations_success_total[5m]) / rate(operator_sdk_reconciliations_started_total[5m]) # Average reconciliation time -rate(otel_operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(otel_operator_sdk_controllers_execution_reconcile_seconds_count[5m]) +rate(operator_sdk_controllers_execution_reconcile_seconds_sum[5m]) / rate(operator_sdk_controllers_execution_reconcile_seconds_count[5m]) # Queue saturation -otel_operator_sdk_reconciliations_queue_size / on() group_left() max(otel_operator_sdk_reconciliations_queue_size) +operator_sdk_reconciliations_queue_size / on() group_left() max(operator_sdk_reconciliations_queue_size) ``` ## References diff --git a/observability/install-observability.sh b/observability/install-observability.sh index 
e724ac54d4..dc7430520b 100755 --- a/observability/install-observability.sh +++ b/observability/install-observability.sh @@ -146,6 +146,8 @@ spec: namespace: "" send_timestamps: true metric_expiration: 5m + resource_to_telemetry_conversion: + enabled: true debug: verbosity: detailed sampling_initial: 5 diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json index 006821a467..6b53d26611 100644 --- a/observability/josdk-operator-metrics-dashboard.json +++ b/observability/josdk-operator-metrics-dashboard.json @@ -103,7 +103,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\"}[5m])) by (kind, version)", "legendFormat": "{{kind}} ({{version}})", "range": true, "refId": "A" @@ -224,7 +224,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_success_total{job=\"webpage-operator\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_success_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Success", "range": true, "refId": "A" @@ -235,7 +235,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Failure", "range": true, "refId": "B" @@ -302,7 +302,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(otel_operator_sdk_reconciliations_executions{job=\"webpage-operator\"})", + "expr": "sum(operator_sdk_reconciliations_executions{service_name=\"josdk\"})", "legendFormat": "Executing", "range": true, "refId": "A" @@ -369,7 +369,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": 
"sum(otel_operator_sdk_reconciliations_queue_size{job=\"webpage-operator\"})", + "expr": "sum(operator_sdk_reconciliations_queue_size{service_name=\"josdk\"})", "legendFormat": "Queue Size", "range": true, "refId": "A" @@ -430,7 +430,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\"})", + "expr": "sum(operator_sdk_reconciliations_started_total{service_name=\"josdk\"})", "legendFormat": "Total", "range": true, "refId": "A" @@ -495,7 +495,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Error Rate", "range": true, "refId": "A" @@ -585,7 +585,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p50 - {{controller}}", "range": true, "refId": "A" @@ -596,7 +596,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p95 - {{controller}}", "range": true, "refId": "B" @@ -607,7 +607,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(otel_operator_sdk_controllers_execution_reconcile_seconds_bucket{job=\"webpage-operator\"}[5m])) by (le, controller))", + "expr": 
"histogram_quantile(0.99, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p99 - {{controller}}", "range": true, "refId": "C" @@ -697,7 +697,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_events_received_total{job=\"webpage-operator\"}[5m])) by (event, action)", + "expr": "sum(rate(operator_sdk_events_received_total{service_name=\"josdk\"}[5m])) by (event, action)", "legendFormat": "{{event}} - {{action}}", "range": true, "refId": "A" @@ -787,7 +787,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_failed_total{job=\"webpage-operator\"}[5m])) by (exception)", + "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m])) by (exception)", "legendFormat": "{{exception}}", "range": true, "refId": "A" @@ -877,7 +877,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_success_total{job=\"webpage-operator\"}[5m])) by (type)", + "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_success_total{service_name=\"josdk\"}[5m])) by (type)", "legendFormat": "Success - {{type}}", "range": true, "refId": "A" @@ -888,7 +888,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_controllers_execution_reconcile_failure_total{job=\"webpage-operator\"}[5m])) by (exception)", + "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_failure_total{service_name=\"josdk\"}[5m])) by (exception)", "legendFormat": "Failure - {{exception}}", "range": true, "refId": "B" @@ -978,7 +978,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_events_delete_total{job=\"webpage-operator\"}[5m])) by (kind, version)", + "expr": "sum(rate(operator_sdk_events_delete_total{service_name=\"josdk\"}[5m])) by (kind, version)", "legendFormat": 
"{{kind}} ({{version}})", "range": true, "refId": "A" @@ -1068,7 +1068,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", "legendFormat": "Last Retry Attempts", "range": true, "refId": "A" @@ -1079,7 +1079,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(otel_operator_sdk_reconciliations_started_total{job=\"webpage-operator\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", "legendFormat": "Retries (Not Last)", "range": true, "refId": "B" diff --git a/observability/jvm-metrics-dashboard.json b/observability/jvm-metrics-dashboard.json index 0a817aa09c..528f29674e 100644 --- a/observability/jvm-metrics-dashboard.json +++ b/observability/jvm-metrics-dashboard.json @@ -106,7 +106,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_memory_used_bytes{job=\"webpage-operator\"}", + "expr": "jvm_memory_used_bytes{service_name=\"josdk\"}", "legendFormat": "{{area}} - {{id}}", "range": true, "refId": "A" @@ -195,7 +195,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_threads_live_threads{job=\"webpage-operator\"}", + "expr": "jvm_threads_live{service_name=\"josdk\"}", "legendFormat": "Live Threads", "range": true, "refId": "A" @@ -206,7 +206,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_threads_daemon_threads{job=\"webpage-operator\"}", + "expr": "jvm_threads_daemon_threads{service_name=\"josdk\"}", "legendFormat": "Daemon Threads", "range": true, "refId": "B" @@ -217,7 +217,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": 
"otel_jvm_threads_peak_threads{job=\"webpage-operator\"}", + "expr": "jvm_threads_peak_threads{service_name=\"josdk\"}", "legendFormat": "Peak Threads", "range": true, "refId": "C" @@ -306,7 +306,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_pause_seconds_sum{job=\"webpage-operator\"}[5m])", + "expr": "rate(jvm_gc_pause_milliseconds_sum{service_name=\"josdk\"}[5m])", "legendFormat": "{{action}} - {{cause}}", "range": true, "refId": "A" @@ -395,7 +395,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_pause_seconds_count{job=\"webpage-operator\"}[5m])", + "expr": "rate(jvm_gc_pause_milliseconds_count{service_name=\"josdk\"}[5m])", "legendFormat": "{{action}} - {{cause}}", "range": true, "refId": "A" @@ -453,7 +453,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_system_cpu_usage{job=\"webpage-operator\"}", + "expr": "system_cpu_usage{service_name=\"josdk\"}", "legendFormat": "CPU Usage", "range": true, "refId": "A" @@ -511,7 +511,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_classes_loaded_classes{job=\"webpage-operator\"}", + "expr": "jvm_classes_loaded{service_name=\"josdk\"}", "legendFormat": "Classes Loaded", "range": true, "refId": "A" @@ -540,7 +540,7 @@ } ] }, - "unit": "s" + "unit": "ms" }, "overrides": [] }, @@ -569,7 +569,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_process_uptime_seconds{job=\"webpage-operator\"}", + "expr": "process_uptime_milliseconds{service_name=\"josdk\"}", "legendFormat": "Uptime", "range": true, "refId": "A" @@ -627,7 +627,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_system_cpu_count{job=\"webpage-operator\"}", + "expr": "system_cpu_count{service_name=\"josdk\"}", "legendFormat": "CPU Count", "range": true, "refId": "A" @@ -716,7 +716,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_memory_allocated_bytes_total{job=\"webpage-operator\"}[5m])", + "expr": 
"rate(jvm_gc_memory_allocated_bytes_total{service_name=\"josdk\"}[5m])", "legendFormat": "Allocated", "range": true, "refId": "A" @@ -727,7 +727,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(otel_jvm_gc_memory_promoted_bytes_total{job=\"webpage-operator\"}[5m])", + "expr": "rate(jvm_gc_memory_promoted_bytes_total{service_name=\"josdk\"}[5m])", "legendFormat": "Promoted", "range": true, "refId": "B" @@ -816,7 +816,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_memory_max_bytes{job=\"webpage-operator\", area=\"heap\"}", + "expr": "jvm_memory_max_bytes{service_name=\"josdk\", area=\"heap\"}", "legendFormat": "Max Heap", "range": true, "refId": "A" @@ -827,7 +827,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "otel_jvm_memory_committed_bytes{job=\"webpage-operator\", area=\"heap\"}", + "expr": "jvm_memory_committed_bytes{service_name=\"josdk\", area=\"heap\"}", "legendFormat": "Committed Heap", "range": true, "refId": "B" diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index e43a253511..ad580736c1 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -58,7 +58,10 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - // TODO remove otel prefix, add job and additional labels?! 
+ // TODO // todo change: + // operator_sdk_reconciliations_queue_size_webpagestandalonedependentsreconciler + // operator_sdk_reconciliations_executions_webpagestandalonedependentsreconciler + // => controller name as label // TODO add test for checking if there are metrics in prometheus // Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); @@ -85,22 +88,24 @@ public static void main(String[] args) throws IOException { private static @NonNull Metrics initOTLPMetrics() { Map configProperties = loadConfigFromYaml(); - OtlpConfig otlpConfig = new OtlpConfig() { - @Override - public String prefix() { - return ""; - } - - @Override - public @Nullable String get(String key) { - return configProperties.get(key); - } - - @Override - public Map resourceAttributes() { - return Map.of("service.name","josdk","operator","webpage"); - } - }; + var otlpConfig = + new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + // these should come from env variables + @Override + public Map resourceAttributes() { + return Map.of("service.name", "josdk", "operator", "webpage"); + } + }; MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); From 6efffd6cb172dc6102a249c0046708be9d42b282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Mon, 9 Feb 2026 21:48:28 +0100 Subject: [PATCH 17/25] improve: micrometer metrics improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index 
7beabb7a6e..cd0572db7b 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -22,6 +22,8 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import org.jspecify.annotations.NonNull; + import io.fabric8.kubernetes.api.model.HasMetadata; import io.javaoperatorsdk.operator.OperatorException; import io.javaoperatorsdk.operator.api.monitoring.Metrics; @@ -37,8 +39,6 @@ import io.micrometer.core.instrument.Tag; import io.micrometer.core.instrument.Timer; -import static io.javaoperatorsdk.operator.api.reconciler.Constants.CONTROLLER_NAME; - public class MicrometerMetrics implements Metrics { private static final String PREFIX = "operator.sdk."; @@ -48,8 +48,8 @@ public class MicrometerMetrics implements Metrics { private static final String RECONCILIATIONS_RETRIES_LAST = RECONCILIATIONS + "retries.last"; private static final String RECONCILIATIONS_RETRIES_NUMBER = RECONCILIATIONS + "retries.number"; private static final String RECONCILIATIONS_STARTED = RECONCILIATIONS + "started"; - private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions."; - private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size."; + private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions"; + private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size"; private static final String NAME = "name"; private static final String NAMESPACE = "namespace"; private static final String GROUP = "group"; @@ -59,6 +59,7 @@ public class MicrometerMetrics implements Metrics { private static final String METADATA_PREFIX = "resource."; private static final String CONTROLLERS_EXECUTION = "controllers.execution."; private static final String CONTROLLER = 
"controller"; + private static final String CONTROLLER_NAME = CONTROLLER + ".name"; private static final String SUCCESS_SUFFIX = ".success"; private static final String FAILURE_SUFFIX = ".failure"; private static final String TYPE = "type"; @@ -130,18 +131,27 @@ private MicrometerMetrics( public void controllerRegistered(Controller controller) { final var configuration = controller.getConfiguration(); final var name = configuration.getName(); - final var executingThreadsName = RECONCILIATIONS_EXECUTIONS + name; + final var executingThreadsRefName = reconciliationExecutionGaugeRefName(name); final var resourceClass = configuration.getResourceClass(); - final var tags = new ArrayList(3); + final var tags = new ArrayList(); + tags.add(Tag.of(CONTROLLER_NAME, name)); addGVKTags(GroupVersionKind.gvkFor(resourceClass), tags, false); AtomicInteger executingThreads = - registry.gauge(executingThreadsName, tags, new AtomicInteger(0)); - gauges.put(executingThreadsName, executingThreads); + registry.gauge(RECONCILIATIONS_EXECUTIONS, tags, new AtomicInteger(0)); + gauges.put(executingThreadsRefName, executingThreads); - final var controllerQueueName = RECONCILIATIONS_QUEUE_SIZE + name; + final var controllerQueueRefName = controllerQueueSizeGaugeRefName(name); AtomicInteger controllerQueueSize = - registry.gauge(controllerQueueName, tags, new AtomicInteger(0)); - gauges.put(controllerQueueName, controllerQueueSize); + registry.gauge(RECONCILIATIONS_QUEUE_SIZE, tags, new AtomicInteger(0)); + gauges.put(controllerQueueRefName, controllerQueueSize); + } + + private static @NonNull String reconciliationExecutionGaugeRefName(String controllerName) { + return RECONCILIATIONS_EXECUTIONS + "." + controllerName; + } + + private static @NonNull String controllerQueueSizeGaugeRefName(String controllerName) { + return RECONCILIATIONS_QUEUE_SIZE + "." 
+ controllerName; } @Override @@ -223,7 +233,7 @@ public void reconcileCustomResource( String.valueOf(retryInfo.map(RetryInfo::isLastAttempt).orElse(true)))); var controllerQueueSize = - gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.incrementAndGet(); } @@ -235,18 +245,18 @@ public void finishedReconciliation(HasMetadata resource, Map met @Override public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.incrementAndGet(); } @Override public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.decrementAndGet(); var controllerQueueSize = - gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.decrementAndGet(); } From 4f38ca9d4e5b9be38ae41f185737cabd1b2837f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 10:12:14 +0100 Subject: [PATCH 18/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 58 +++++++++++++++---- .../api/monitoring/AggregatedMetrics.java | 5 +- .../operator/api/monitoring/Metrics.java | 3 +- .../processing/event/EventProcessor.java | 2 +- .../api/monitoring/AggregatedMetricsTest.java | 10 ++-- 5 files changed, 57 insertions(+), 21 deletions(-) diff --git 
a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index cd0572db7b..94391bec82 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -221,16 +221,18 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) public void reconcileCustomResource( HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { Optional retryInfo = Optional.ofNullable(retryInfoNullable); - incrementCounter( - ResourceID.fromResource(resource), - RECONCILIATIONS_STARTED, - metadata, - Tag.of( - RECONCILIATIONS_RETRIES_NUMBER, - String.valueOf(retryInfo.map(RetryInfo::getAttemptCount).orElse(0))), - Tag.of( - RECONCILIATIONS_RETRIES_LAST, - String.valueOf(retryInfo.map(RetryInfo::isLastAttempt).orElse(true)))); + ResourceID resourceID = ResourceID.fromResource(resource); + + // Record the counter without retry tags + incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); + + // Update retry number gauge + int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); + + // Update retry last attempt gauge (1 for true, 0 for false) + int isLastAttempt = retryInfo.map(RetryInfo::isLastAttempt).orElse(true) ? 
1 : 0; + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, isLastAttempt); var controllerQueueSize = gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); @@ -238,8 +240,14 @@ public void reconcileCustomResource( } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_SUCCESS, metadata); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + ResourceID resourceID = ResourceID.fromResource(resource); + incrementCounter(resourceID, RECONCILIATIONS_SUCCESS, metadata); + + // Reset retry gauges on successful reconciliation + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, 0); } @Override @@ -335,6 +343,32 @@ private void incrementCounter( counter.increment(); } + private void updateGauge( + ResourceID id, Map metadata, String gaugeName, int value) { + final var tags = new ArrayList(6); + addMetadataTags(id, metadata, tags, false); + + final var gaugeRefName = buildGaugeRefName(id, gaugeName); + AtomicInteger gauge = + gauges.computeIfAbsent( + gaugeRefName, + key -> { + AtomicInteger newGauge = + registry.gauge(PREFIX + gaugeName, tags, new AtomicInteger(0)); + // Find the meter in the registry and record it for cleanup + var meter = registry.find(PREFIX + gaugeName).tags(tags).gauge(); + if (meter != null) { + cleaner.recordAssociation(id, meter); + } + return newGauge; + }); + gauge.set(value); + } + + private String buildGaugeRefName(ResourceID id, String gaugeName) { + return gaugeName + "." + id.getName() + "." 
+ id.getNamespace().orElse(CLUSTER); + } + protected Set recordedMeterIdsFor(ResourceID resourceID) { return cleaner.recordedMeterIdsFor(resourceID); } diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java index f66bdc47c6..4e3540bf55 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java @@ -103,8 +103,9 @@ public void cleanupDoneFor(ResourceID resourceID, Map metadata) } @Override - public void finishedReconciliation(HasMetadata resource, Map metadata) { - metricsList.forEach(metrics -> metrics.finishedReconciliation(resource, metadata)); + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + metricsList.forEach(metrics -> metrics.successfullyFinishedReconciliation(resource, metadata)); } @Override diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index 10b2db6774..cda6fd167b 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -93,7 +93,8 @@ default void cleanupDoneFor(ResourceID resourceID, Map metadata) * @param resource the {@link ResourceID} associated with the resource being processed * @param metadata metadata associated with the resource being processed */ - default void finishedReconciliation(HasMetadata resource, Map metadata) {} + default void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) {} /** * Encapsulates the information about a controller 
execution i.e. a call to either {@link diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java index b476c39614..4ff482f03e 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java @@ -292,7 +292,7 @@ synchronized void eventProcessingFinished( return; } cleanupOnSuccessfulExecution(executionScope); - metrics.finishedReconciliation(executionScope.getResource(), metricsMetadata); + metrics.successfullyFinishedReconciliation(executionScope.getResource(), metricsMetadata); if ((triggerOnAllEvents() && executionScope.isDeleteEvent()) || (!triggerOnAllEvents() && state.deleteEventPresent())) { cleanupForDeletedEvent(executionScope.getResourceID()); diff --git a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java index 68142048b6..36a3ca0877 100644 --- a/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java +++ b/operator-framework-core/src/test/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetricsTest.java @@ -141,13 +141,13 @@ void cleanupDoneFor_shouldDelegateToAllMetricsInOrder() { } @Test - void finishedReconciliation_shouldDelegateToAllMetricsInOrder() { - aggregatedMetrics.finishedReconciliation(resource, metadata); + void successfullyFinishedReconciliation_shouldDelegateToAllMetricsInOrder() { + aggregatedMetrics.successfullyFinishedReconciliation(resource, metadata); final var inOrder = inOrder(metrics1, metrics2, metrics3); - inOrder.verify(metrics1).finishedReconciliation(resource, metadata); - 
inOrder.verify(metrics2).finishedReconciliation(resource, metadata); - inOrder.verify(metrics3).finishedReconciliation(resource, metadata); + inOrder.verify(metrics1).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics2).successfullyFinishedReconciliation(resource, metadata); + inOrder.verify(metrics3).successfullyFinishedReconciliation(resource, metadata); verifyNoMoreInteractions(metrics1, metrics2, metrics3); } From 87fd2995a71008b49f2ee16318d38c426946db17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 14:17:37 +0100 Subject: [PATCH 19/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 69 ++++++++-- .../josdk-operator-metrics-dashboard.json | 123 ++++++++++++++++-- 2 files changed, 169 insertions(+), 23 deletions(-) diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index 94391bec82..0886c46fc6 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -21,6 +21,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; import org.jspecify.annotations.NonNull; @@ -75,6 +76,7 @@ public class MicrometerMetrics implements Metrics { private final MeterRegistry registry; private final Map gauges = new ConcurrentHashMap<>(); private final Cleaner cleaner; + private final Consumer timerConfig; /** * Creates a MicrometerMetrics instance configured to not collect per-resource metrics, just @@ -84,7 +86,7 @@ 
public class MicrometerMetrics implements Metrics { * @return a MicrometerMetrics instance configured to not collect per-resource metrics */ public static MicrometerMetrics withoutPerResourceMetrics(MeterRegistry registry) { - return new MicrometerMetrics(registry, Cleaner.NOOP, false); + return new MicrometerMetrics(registry, Cleaner.NOOP, false, null); } /** @@ -108,7 +110,7 @@ public static MicrometerMetricsBuilder newMicrometerMetricsBuilder(MeterRegistry */ public static PerResourceCollectingMicrometerMetricsBuilder newPerResourceCollectingMicrometerMetricsBuilder(MeterRegistry registry) { - return new PerResourceCollectingMicrometerMetricsBuilder(registry); + return new PerResourceCollectingMicrometerMetricsBuilder(registry, null); } /** @@ -119,12 +121,21 @@ public static MicrometerMetricsBuilder newMicrometerMetricsBuilder(MeterRegistry * @param registry the {@link MeterRegistry} instance to use for metrics recording * @param cleaner the {@link Cleaner} to use * @param collectingPerResourceMetrics whether to collect per resource metrics + * @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5, + * 0.95, 0.99 and histogram */ private MicrometerMetrics( - MeterRegistry registry, Cleaner cleaner, boolean collectingPerResourceMetrics) { + MeterRegistry registry, + Cleaner cleaner, + boolean collectingPerResourceMetrics, + Consumer timerConfig) { this.registry = registry; this.cleaner = cleaner; this.collectPerResourceMetrics = collectingPerResourceMetrics; + this.timerConfig = + timerConfig != null + ? 
timerConfig + : builder -> builder.publishPercentiles(0.5, 0.95, 0.99).publishPercentileHistogram(); } @Override @@ -163,12 +174,9 @@ public T timeControllerExecution(ControllerExecution execution) { final var tags = new ArrayList(16); tags.add(Tag.of(CONTROLLER, name)); addMetadataTags(resourceID, metadata, tags, true); - final var timer = - Timer.builder(execName) - .tags(tags) - .publishPercentiles(0.3, 0.5, 0.95) - .publishPercentileHistogram() - .register(registry); + final var timerBuilder = Timer.builder(execName).tags(tags); + timerConfig.accept(timerBuilder); + final var timer = timerBuilder.register(registry); try { final var result = timer.record( @@ -379,8 +387,27 @@ public static class PerResourceCollectingMicrometerMetricsBuilder private int cleaningThreadsNumber; private int cleanUpDelayInSeconds; - private PerResourceCollectingMicrometerMetricsBuilder(MeterRegistry registry) { + private PerResourceCollectingMicrometerMetricsBuilder( + MeterRegistry registry, Consumer timerConfig) { super(registry); + this.executionTimerConfig = timerConfig; + } + + /** + * Configures the Timer used for timing controller executions. By default, timers are configured + * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. You can set: {@code + * .minimumExpectedValue(Duration.ofMillis(...)).maximumExpectedValue(Duration.ofSeconds(...)) } + * so micrometer can create the buckets for you. + * + * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder + * will already have the metric name and tags set. 
+ * @return this builder for method chaining + */ + @Override + public PerResourceCollectingMicrometerMetricsBuilder withExecutionTimerConfig( + Consumer executionTimerConfig) { + this.executionTimerConfig = executionTimerConfig; + return this; } /** @@ -412,23 +439,38 @@ public PerResourceCollectingMicrometerMetricsBuilder withCleanUpDelayInSeconds( public MicrometerMetrics build() { final var cleaner = new DelayedCleaner(registry, cleanUpDelayInSeconds, cleaningThreadsNumber); - return new MicrometerMetrics(registry, cleaner, true); + return new MicrometerMetrics(registry, cleaner, true, executionTimerConfig); } } public static class MicrometerMetricsBuilder { protected final MeterRegistry registry; private boolean collectingPerResourceMetrics = true; + protected Consumer executionTimerConfig = null; private MicrometerMetricsBuilder(MeterRegistry registry) { this.registry = registry; } + /** + * Configures the Timer used for timing controller executions. By default, timers are configured + * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. + * + * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder + * will already have the metric name and tags set. + * @return this builder for method chaining + */ + public MicrometerMetricsBuilder withExecutionTimerConfig( + Consumer executionTimerConfig) { + this.executionTimerConfig = executionTimerConfig; + return this; + } + /** Configures the instance to collect metrics on a per-resource basis. 
*/ @SuppressWarnings("unused") public PerResourceCollectingMicrometerMetricsBuilder collectingMetricsPerResource() { collectingPerResourceMetrics = true; - return new PerResourceCollectingMicrometerMetricsBuilder(registry); + return new PerResourceCollectingMicrometerMetricsBuilder(registry, executionTimerConfig); } /** @@ -442,7 +484,8 @@ public MicrometerMetricsBuilder notCollectingMetricsPerResource() { } public MicrometerMetrics build() { - return new MicrometerMetrics(registry, Cleaner.NOOP, collectingPerResourceMetrics); + return new MicrometerMetrics( + registry, Cleaner.NOOP, collectingPerResourceMetrics, executionTimerConfig); } } diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json index 6b53d26611..6b6236cd2b 100644 --- a/observability/josdk-operator-metrics-dashboard.json +++ b/observability/josdk-operator-metrics-dashboard.json @@ -992,7 +992,7 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "Reconciliation retry information", + "description": "Current retry attempt number for resources being retried", "fieldConfig": { "defaults": { "color": { @@ -1018,7 +1018,7 @@ "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -1035,10 +1035,18 @@ { "color": "green", "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 3 } ] }, - "unit": "ops" + "unit": "short" }, "overrides": [] }, @@ -1051,7 +1059,7 @@ "id": 12, "options": { "legend": { - "calcs": ["last", "mean"], + "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1068,24 +1076,119 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"true\"}[5m]))", - "legendFormat": "Last Retry Attempts", + "expr": 
"operator_sdk_reconciliations_retries_number{service_name=\"josdk\"}", + "legendFormat": "{{kind}}/{{name}} ({{namespace}})", "range": true, "refId": "A" + } + ], + "title": "Reconciliation Retry Attempts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Resources currently on their last retry attempt (1 = last attempt, 0 = not last or no retry)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [ + { + "options": { + "0": { + "text": "No" + }, + "1": { + "text": "Yes" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 13, + "options": { + "legend": { + "calcs": ["last"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\", operator_sdk_reconciliations_retries_last=\"false\"}[5m]))", - "legendFormat": "Retries (Not Last)", + "expr": "operator_sdk_reconciliations_retries_last{service_name=\"josdk\"}", + 
"legendFormat": "{{kind}}/{{name}} ({{namespace}})", "range": true, - "refId": "B" + "refId": "A" } ], - "title": "Reconciliation Retry Rate", + "title": "Resources on Last Retry Attempt", "type": "timeseries" } ], From 98b260a9fab1883b1e9a5eabbea898b834a1d6ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 18:03:32 +0100 Subject: [PATCH 20/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 202 +++--------------- .../AbstractMicrometerMetricsTestFixture.java | 17 -- .../micrometer/DefaultBehaviorIT.java | 23 -- .../DelayedMetricsCleaningOnDeleteIT.java | 46 ---- .../micrometer/NoPerResourceCollectionIT.java | 15 -- .../josdk-operator-metrics-dashboard.json | 126 +---------- .../operator/api/monitoring/Metrics.java | 11 - .../operator/processing/Controller.java | 22 -- sample-operators/webpage/k8s/webpage2.yaml | 34 +++ .../operator/sample/WebPageOperator.java | 56 +++-- .../webpage/src/main/resources/log4j2.xml | 2 +- 11 files changed, 119 insertions(+), 435 deletions(-) delete mode 100644 micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java create mode 100644 sample-operators/webpage/k8s/webpage2.yaml diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index 0886c46fc6..45f1517864 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -17,9 +17,6 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import 
java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; @@ -35,22 +32,23 @@ import io.javaoperatorsdk.operator.processing.event.Event; import io.javaoperatorsdk.operator.processing.event.ResourceID; import io.javaoperatorsdk.operator.processing.event.source.controller.ResourceEvent; -import io.micrometer.core.instrument.Meter; import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.Tag; import io.micrometer.core.instrument.Timer; public class MicrometerMetrics implements Metrics { + private static final String SUCCESS_SUFFIX = "success"; + private static final String FAILURE_SUFFIX = "failure"; private static final String PREFIX = "operator.sdk."; private static final String RECONCILIATIONS = "reconciliations."; - private static final String RECONCILIATIONS_FAILED = RECONCILIATIONS + "failed"; - private static final String RECONCILIATIONS_SUCCESS = RECONCILIATIONS + "success"; - private static final String RECONCILIATIONS_RETRIES_LAST = RECONCILIATIONS + "retries.last"; - private static final String RECONCILIATIONS_RETRIES_NUMBER = RECONCILIATIONS + "retries.number"; - private static final String RECONCILIATIONS_STARTED = RECONCILIATIONS + "started"; + private static final String RECONCILIATIONS_FAILED = PREFIX + RECONCILIATIONS + FAILURE_SUFFIX; + private static final String RECONCILIATIONS_SUCCESS = PREFIX + RECONCILIATIONS + SUCCESS_SUFFIX; + private static final String RECONCILIATIONS_RETRIES_NUMBER = + PREFIX + RECONCILIATIONS + "retries.number"; + private static final String RECONCILIATIONS_STARTED = PREFIX + RECONCILIATIONS + "started"; private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions"; - private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size"; + private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + 
RECONCILIATIONS + "active"; private static final String NAME = "name"; private static final String NAMESPACE = "namespace"; private static final String GROUP = "group"; @@ -58,24 +56,25 @@ public class MicrometerMetrics implements Metrics { private static final String KIND = "kind"; private static final String SCOPE = "scope"; private static final String METADATA_PREFIX = "resource."; - private static final String CONTROLLERS_EXECUTION = "controllers.execution."; + private static final String CONTROLLERS = "controllers."; + private static final String RECONCILIATION_EXECUTION_TIME = + PREFIX + RECONCILIATIONS + "execution" + ".duration"; + private static final String CONTROLLERS_SUCCESSFUL_EXECUTION = + PREFIX + CONTROLLERS + SUCCESS_SUFFIX; + private static final String CONTROLLERS_FAILED_EXECUTION = PREFIX + CONTROLLERS + FAILURE_SUFFIX; private static final String CONTROLLER = "controller"; private static final String CONTROLLER_NAME = CONTROLLER + ".name"; - private static final String SUCCESS_SUFFIX = ".success"; - private static final String FAILURE_SUFFIX = ".failure"; - private static final String TYPE = "type"; - private static final String EXCEPTION = "exception"; private static final String EVENT = "event"; private static final String ACTION = "action"; - private static final String EVENTS_RECEIVED = "events.received"; - private static final String EVENTS_DELETE = "events.delete"; + private static final String EVENTS_RECEIVED = PREFIX + "events.received"; + private static final String EVENTS_DELETE = PREFIX + "events.delete"; private static final String CLUSTER = "cluster"; private static final String SIZE_SUFFIX = ".size"; private static final String UNKNOWN_ACTION = "UNKNOWN"; private final boolean collectPerResourceMetrics; private final MeterRegistry registry; + // todo double check if we actually need this private final Map gauges = new ConcurrentHashMap<>(); - private final Cleaner cleaner; private final Consumer timerConfig; /** @@ -86,7 +85,7 @@ 
public class MicrometerMetrics implements Metrics { * @return a MicrometerMetrics instance configured to not collect per-resource metrics */ public static MicrometerMetrics withoutPerResourceMetrics(MeterRegistry registry) { - return new MicrometerMetrics(registry, Cleaner.NOOP, false, null); + return new MicrometerMetrics(registry, false, null); } /** @@ -113,24 +112,21 @@ public static MicrometerMetricsBuilder newMicrometerMetricsBuilder(MeterRegistry return new PerResourceCollectingMicrometerMetricsBuilder(registry, null); } + // todo as v2 class + // todo make backwards compatible /** - * Creates a micrometer-based Metrics implementation that cleans up {@link Meter}s associated with - * deleted resources as specified by the (possibly {@code null}) provided {@link Cleaner} - * instance. + * Creates a micrometer-based Metrics implementation. * * @param registry the {@link MeterRegistry} instance to use for metrics recording - * @param cleaner the {@link Cleaner} to use * @param collectingPerResourceMetrics whether to collect per resource metrics * @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5, * 0.95, 0.99 and histogram */ private MicrometerMetrics( MeterRegistry registry, - Cleaner cleaner, boolean collectingPerResourceMetrics, Consumer timerConfig) { this.registry = registry; - this.cleaner = cleaner; this.collectPerResourceMetrics = collectingPerResourceMetrics; this.timerConfig = timerConfig != null @@ -165,16 +161,16 @@ public void controllerRegistered(Controller controller) { return RECONCILIATIONS_QUEUE_SIZE + "." + controllerName; } + // todo does it make sense to have both controller and reconciler execution counters? 
@Override public T timeControllerExecution(ControllerExecution execution) { final var name = execution.controllerName(); - final var execName = PREFIX + CONTROLLERS_EXECUTION + execution.name(); final var resourceID = execution.resourceID(); final var metadata = execution.metadata(); final var tags = new ArrayList(16); tags.add(Tag.of(CONTROLLER, name)); addMetadataTags(resourceID, metadata, tags, true); - final var timerBuilder = Timer.builder(execName).tags(tags); + final var timerBuilder = Timer.builder(RECONCILIATION_EXECUTION_TIME).tags(tags); timerConfig.accept(timerBuilder); final var timer = timerBuilder.register(registry); try { @@ -187,27 +183,23 @@ public T timeControllerExecution(ControllerExecution execution) { throw new OperatorException(e); } }); - final var successType = execution.successTypeName(result); - registry.counter(execName + SUCCESS_SUFFIX, CONTROLLER, name, TYPE, successType).increment(); + registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER, name).increment(); return result; } catch (Exception e) { - final var exception = e.getClass().getSimpleName(); - registry - .counter(execName + FAILURE_SUFFIX, CONTROLLER, name, EXCEPTION, exception) - .increment(); + registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER, name).increment(); throw e; } } @Override public void receivedEvent(Event event, Map metadata) { - if (event instanceof ResourceEvent) { + if (event instanceof ResourceEvent resourceEvent) { incrementCounter( event.getRelatedCustomResourceID(), EVENTS_RECEIVED, metadata, Tag.of(EVENT, event.getClass().getSimpleName()), - Tag.of(ACTION, ((ResourceEvent) event).getAction().toString())); + Tag.of(ACTION, resourceEvent.getAction().toString())); } else { incrementCounter( event.getRelatedCustomResourceID(), @@ -221,8 +213,6 @@ public void receivedEvent(Event event, Map metadata) { @Override public void cleanupDoneFor(ResourceID resourceID, Map metadata) { incrementCounter(resourceID, EVENTS_DELETE, metadata); - - 
cleaner.removeMetersFor(resourceID); } @Override @@ -234,14 +224,11 @@ public void reconcileCustomResource( // Record the counter without retry tags incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); + // todo add metric with for resources in exhaisted retry // Update retry number gauge int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); - // Update retry last attempt gauge (1 for true, 0 for false) - int isLastAttempt = retryInfo.map(RetryInfo::isLastAttempt).orElse(true) ? 1 : 0; - updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, isLastAttempt); - var controllerQueueSize = gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.incrementAndGet(); @@ -255,7 +242,6 @@ public void successfullyFinishedReconciliation( // Reset retry gauges on successful reconciliation updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); - updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_LAST, 0); } @Override @@ -279,17 +265,7 @@ public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { - var cause = exception.getCause(); - if (cause == null) { - cause = exception; - } else if (cause instanceof RuntimeException) { - cause = cause.getCause() != null ? 
cause.getCause() : cause; - } - incrementCounter( - ResourceID.fromResource(resource), - RECONCILIATIONS_FAILED, - metadata, - Tag.of(EXCEPTION, cause.getClass().getSimpleName())); + incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_FAILED, metadata); } @Override @@ -346,8 +322,7 @@ private void incrementCounter( tags.addAll(List.of(additionalTags)); } - final var counter = registry.counter(PREFIX + counterName, tags); - cleaner.recordAssociation(id, counter); + final var counter = registry.counter(counterName, tags); counter.increment(); } @@ -356,37 +331,15 @@ private void updateGauge( final var tags = new ArrayList(6); addMetadataTags(id, metadata, tags, false); - final var gaugeRefName = buildGaugeRefName(id, gaugeName); AtomicInteger gauge = gauges.computeIfAbsent( - gaugeRefName, - key -> { - AtomicInteger newGauge = - registry.gauge(PREFIX + gaugeName, tags, new AtomicInteger(0)); - // Find the meter in the registry and record it for cleanup - var meter = registry.find(PREFIX + gaugeName).tags(tags).gauge(); - if (meter != null) { - cleaner.recordAssociation(id, meter); - } - return newGauge; - }); + gaugeName, key -> registry.gauge(gaugeName, tags, new AtomicInteger(0))); gauge.set(value); } - private String buildGaugeRefName(ResourceID id, String gaugeName) { - return gaugeName + "." + id.getName() + "." 
+ id.getNamespace().orElse(CLUSTER); - } - - protected Set recordedMeterIdsFor(ResourceID resourceID) { - return cleaner.recordedMeterIdsFor(resourceID); - } - public static class PerResourceCollectingMicrometerMetricsBuilder extends MicrometerMetricsBuilder { - private int cleaningThreadsNumber; - private int cleanUpDelayInSeconds; - private PerResourceCollectingMicrometerMetricsBuilder( MeterRegistry registry, Consumer timerConfig) { super(registry); @@ -410,36 +363,9 @@ public PerResourceCollectingMicrometerMetricsBuilder withExecutionTimerConfig( return this; } - /** - * @param cleaningThreadsNumber the maximal number of threads that can be assigned to the - * removal of {@link Meter}s associated with deleted resources, defaults to 1 if not - * specified or if the provided number is lesser or equal to 0 - */ - public PerResourceCollectingMicrometerMetricsBuilder withCleaningThreadNumber( - int cleaningThreadsNumber) { - this.cleaningThreadsNumber = cleaningThreadsNumber <= 0 ? 1 : cleaningThreadsNumber; - return this; - } - - /** - * @param cleanUpDelayInSeconds the number of seconds to wait before {@link Meter}s are removed - * for deleted resources, defaults to 1 (meaning meters will be removed one second after the - * associated resource is deleted) if not specified or if the provided number is lesser than - * 0. 
Threading and the general interaction model of interacting with the API server means - * that it's not possible to ensure that meters are immediately deleted in all cases so a - * minimal delay of one second is always enforced - */ - public PerResourceCollectingMicrometerMetricsBuilder withCleanUpDelayInSeconds( - int cleanUpDelayInSeconds) { - this.cleanUpDelayInSeconds = Math.max(cleanUpDelayInSeconds, 1); - return this; - } - @Override public MicrometerMetrics build() { - final var cleaner = - new DelayedCleaner(registry, cleanUpDelayInSeconds, cleaningThreadsNumber); - return new MicrometerMetrics(registry, cleaner, true, executionTimerConfig); + return new MicrometerMetrics(registry, true, executionTimerConfig); } } @@ -484,69 +410,7 @@ public MicrometerMetricsBuilder notCollectingMetricsPerResource() { } public MicrometerMetrics build() { - return new MicrometerMetrics( - registry, Cleaner.NOOP, collectingPerResourceMetrics, executionTimerConfig); - } - } - - interface Cleaner { - Cleaner NOOP = new Cleaner() {}; - - default void removeMetersFor(ResourceID resourceID) {} - - default void recordAssociation(ResourceID resourceID, Meter meter) {} - - default Set recordedMeterIdsFor(ResourceID resourceID) { - return Collections.emptySet(); - } - } - - static class DefaultCleaner implements Cleaner { - private final Map> metersPerResource = new ConcurrentHashMap<>(); - private final MeterRegistry registry; - - private DefaultCleaner(MeterRegistry registry) { - this.registry = registry; - } - - @Override - public void removeMetersFor(ResourceID resourceID) { - // remove each meter - final var toClean = metersPerResource.get(resourceID); - if (toClean != null) { - toClean.forEach(registry::remove); - } - // then clean-up local recording of associations - metersPerResource.remove(resourceID); - } - - @Override - public void recordAssociation(ResourceID resourceID, Meter meter) { - metersPerResource.computeIfAbsent(resourceID, id -> new 
HashSet<>()).add(meter.getId()); - } - - @Override - public Set recordedMeterIdsFor(ResourceID resourceID) { - return metersPerResource.get(resourceID); - } - } - - static class DelayedCleaner extends MicrometerMetrics.DefaultCleaner { - private final ScheduledExecutorService metersCleaner; - private final int cleanUpDelayInSeconds; - - private DelayedCleaner( - MeterRegistry registry, int cleanUpDelayInSeconds, int cleaningThreadsNumber) { - super(registry); - this.cleanUpDelayInSeconds = cleanUpDelayInSeconds; - this.metersCleaner = Executors.newScheduledThreadPool(cleaningThreadsNumber); - } - - @Override - public void removeMetersFor(ResourceID resourceID) { - // schedule deletion of meters associated with ResourceID - metersCleaner.schedule( - () -> super.removeMetersFor(resourceID), cleanUpDelayInSeconds, TimeUnit.SECONDS); + return new MicrometerMetrics(registry, collectingPerResourceMetrics, executionTimerConfig); } } } diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java index 660ac5381c..b0346a2444 100644 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java @@ -30,7 +30,6 @@ import io.micrometer.core.instrument.Meter; import io.micrometer.core.instrument.simple.SimpleMeterRegistry; -import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; @TestInstance(TestInstance.Lifecycle.PER_CLASS) @@ -66,28 +65,12 @@ void properlyHandlesResourceDeletion() throws Exception { .isEmpty()); final var resourceID = ResourceID.fromResource(created); - final var meters = preDeleteChecks(resourceID); // delete the 
resource and wait for it to be deleted operator.delete(testResource); await().until(() -> operator.get(ConfigMap.class, testResourceName) == null); - - postDeleteChecks(resourceID, meters); - } - - protected Set preDeleteChecks(ResourceID resourceID) { - // check that we properly recorded meters associated with the resource - final var meters = metrics.recordedMeterIdsFor(resourceID); - // metrics are collected per resource - assertThat(registry.getMetersAsString()).contains(resourceID.getName()); - assertThat(meters).isNotNull(); - assertThat(meters).isNotEmpty(); - return meters; } - protected void postDeleteChecks(ResourceID resourceID, Set recordedMeters) - throws Exception {} - @ControllerConfiguration private static class MetricsCleaningTestReconciler implements Reconciler, Cleaner { diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java index 928b01f55e..21376ea58d 100644 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java @@ -15,32 +15,9 @@ */ package io.javaoperatorsdk.operator.monitoring.micrometer; -import java.util.Collections; -import java.util.Set; - -import io.javaoperatorsdk.operator.processing.event.ResourceID; -import io.micrometer.core.instrument.Meter; - -import static org.assertj.core.api.Assertions.assertThat; - public class DefaultBehaviorIT extends AbstractMicrometerMetricsTestFixture { @Override protected MicrometerMetrics getMetrics() { return MicrometerMetrics.newMicrometerMetricsBuilder(registry).build(); } - - @Override - protected Set preDeleteChecks(ResourceID resourceID) { - // no meter should be recorded because we're not tracking anything to be deleted later - 
assertThat(metrics.recordedMeterIdsFor(resourceID)).isEmpty(); - // metrics are collected per resource by default for now, this will change in a future release - assertThat(registry.getMetersAsString()).contains(resourceID.getName()); - return Collections.emptySet(); - } - - @Override - protected void postDeleteChecks(ResourceID resourceID, Set recordedMeters) { - // meters should be neither recorded, nor removed by default - assertThat(registry.getRemoved()).isEmpty(); - } } diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java deleted file mode 100644 index bfed1f1089..0000000000 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright Java Operator SDK Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.javaoperatorsdk.operator.monitoring.micrometer; - -import java.time.Duration; -import java.util.Set; - -import io.javaoperatorsdk.operator.processing.event.ResourceID; -import io.micrometer.core.instrument.Meter; - -import static org.assertj.core.api.Assertions.assertThat; - -public class DelayedMetricsCleaningOnDeleteIT extends AbstractMicrometerMetricsTestFixture { - - private static final int testDelay = 1; - - @Override - protected MicrometerMetrics getMetrics() { - return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) - .withCleanUpDelayInSeconds(testDelay) - .withCleaningThreadNumber(2) - .build(); - } - - @Override - protected void postDeleteChecks(ResourceID resourceID, Set recordedMeters) - throws Exception { - // check that the meters are properly removed after the specified delay - Thread.sleep(Duration.ofSeconds(testDelay).toMillis()); - assertThat(registry.getRemoved()).isEqualTo(recordedMeters); - assertThat(metrics.recordedMeterIdsFor(resourceID)).isNull(); - } -} diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java index c8dc32cd91..2fcd5c152f 100644 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java @@ -15,24 +15,9 @@ */ package io.javaoperatorsdk.operator.monitoring.micrometer; -import java.util.Collections; -import java.util.Set; - -import io.javaoperatorsdk.operator.processing.event.ResourceID; -import io.micrometer.core.instrument.Meter; - -import static org.assertj.core.api.Assertions.assertThat; - public class NoPerResourceCollectionIT extends AbstractMicrometerMetricsTestFixture { @Override protected MicrometerMetrics 
getMetrics() { return MicrometerMetrics.withoutPerResourceMetrics(registry); } - - @Override - protected Set preDeleteChecks(ResourceID resourceID) { - assertThat(metrics.recordedMeterIdsFor(resourceID)).isEmpty(); - assertThat(registry.getMetersAsString()).doesNotContain(resourceID.getName()); - return Collections.emptySet(); - } } diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json index 6b6236cd2b..0ec869978e 100644 --- a/observability/josdk-operator-metrics-dashboard.json +++ b/observability/josdk-operator-metrics-dashboard.json @@ -369,13 +369,13 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(operator_sdk_reconciliations_queue_size{service_name=\"josdk\"})", - "legendFormat": "Queue Size", + "expr": "sum(operator_sdk_reconciliations_active{service_name=\"josdk\"})", + "legendFormat": "Active", "range": true, "refId": "A" } ], - "title": "Reconciliation Queue Size", + "title": "Active Reconciliations", "type": "gauge" }, { @@ -585,7 +585,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_reconciliations_execution_duration_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p50 - {{controller}}", "range": true, "refId": "A" @@ -596,7 +596,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_reconciliations_execution_duration_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p95 - {{controller}}", "range": true, "refId": "B" @@ -607,7 +607,7 @@ "uid": "prometheus" }, "editorMode": "code", - 
"expr": "histogram_quantile(0.99, sum(rate(operator_sdk_controllers_execution_reconcile_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", + "expr": "histogram_quantile(0.99, sum(rate(operator_sdk_reconciliations_execution_duration_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", "legendFormat": "p99 - {{controller}}", "range": true, "refId": "C" @@ -877,8 +877,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_success_total{service_name=\"josdk\"}[5m])) by (type)", - "legendFormat": "Success - {{type}}", + "expr": "sum(rate(operator_sdk_controllers_success_total{service_name=\"josdk\"}[5m])) by (controller)", + "legendFormat": "Success - {{controller}}", "range": true, "refId": "A" }, @@ -888,8 +888,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_controllers_execution_reconcile_failure_total{service_name=\"josdk\"}[5m])) by (exception)", - "legendFormat": "Failure - {{exception}}", + "expr": "sum(rate(operator_sdk_controllers_failure_total{service_name=\"josdk\"}[5m])) by (controller)", + "legendFormat": "Failure - {{controller}}", "range": true, "refId": "B" } @@ -1084,112 +1084,6 @@ ], "title": "Reconciliation Retry Attempts", "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "description": "Resources currently on their last retry attempt (1 = last attempt, 0 = not last or no retry)", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "stepAfter", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": 
"auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [ - { - "options": { - "0": { - "text": "No" - }, - "1": { - "text": "Yes" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 40 - }, - "id": 13, - "options": { - "legend": { - "calcs": ["last"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "operator_sdk_reconciliations_retries_last{service_name=\"josdk\"}", - "legendFormat": "{{kind}}/{{name}} ({{namespace}})", - "range": true, - "refId": "A" - } - ], - "title": "Resources on Last Retry Attempt", - "type": "timeseries" } ], "refresh": "10s", diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index cda6fd167b..7b3d5a9c03 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -125,17 +125,6 @@ interface ControllerExecution { */ String controllerName(); - /** - * Retrieves the name of the successful result when the reconciliation ended positively. - * Possible values comes from the different outcomes provided by {@link - * io.javaoperatorsdk.operator.api.reconciler.UpdateControl} or {@link - * io.javaoperatorsdk.operator.api.reconciler.DeleteControl}. 
- * - * @param result the reconciliation result - * @return a name associated with the specified outcome - */ - String successTypeName(T result); - /** * Retrieves the {@link ResourceID} of the resource associated with the controller execution * being considered diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java index bc3a43a9a3..3d6fc536a2 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java @@ -70,12 +70,7 @@ public class Controller

private static final Logger log = LoggerFactory.getLogger(Controller.class); private static final String CLEANUP = "cleanup"; - private static final String DELETE = "delete"; - private static final String FINALIZER_NOT_REMOVED = "finalizerNotRemoved"; private static final String RECONCILE = "reconcile"; - private static final String RESOURCE = "resource"; - private static final String STATUS = "status"; - private static final String BOTH = "both"; public static final String CLEANER_NOT_SUPPORTED_ON_ALL_EVENT_ERROR_MESSAGE = "Cleaner is not supported when triggerReconcilerOnAllEvents enabled."; public static final String @@ -155,18 +150,6 @@ public String controllerName() { return configuration.getName(); } - @Override - public String successTypeName(UpdateControl

result) { - String successType = RESOURCE; - if (result.isPatchStatus()) { - successType = STATUS; - } - if (result.isPatchResourceAndStatus()) { - successType = BOTH; - } - return successType; - } - @Override public ResourceID resourceID() { return ResourceID.fromResource(resource); @@ -208,11 +191,6 @@ public String controllerName() { return configuration.getName(); } - @Override - public String successTypeName(DeleteControl deleteControl) { - return deleteControl.isRemoveFinalizer() ? DELETE : FINALIZER_NOT_REMOVED; - } - @Override public ResourceID resourceID() { return ResourceID.fromResource(resource); diff --git a/sample-operators/webpage/k8s/webpage2.yaml b/sample-operators/webpage/k8s/webpage2.yaml new file mode 100644 index 0000000000..e9ae5ab19e --- /dev/null +++ b/sample-operators/webpage/k8s/webpage2.yaml @@ -0,0 +1,34 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +apiVersion: "sample.javaoperatorsdk/v1" +kind: WebPage +metadata: +# Use labels to match the resource with different reconciler implementations: +# labels: +# low-level: "true" + name: hellows2 +spec: + exposed: false + html: | + + + Hello Operator World + + + Hello World! 
+ + diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index ad580736c1..fb66f511d4 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.InetSocketAddress; +import java.time.Duration; import java.util.HashMap; import java.util.Map; @@ -34,12 +35,9 @@ import io.javaoperatorsdk.operator.sample.probes.StartupHandler; import io.micrometer.core.instrument.Clock; import io.micrometer.core.instrument.MeterRegistry; -import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics; -import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics; -import io.micrometer.core.instrument.binder.jvm.JvmMemoryMetrics; -import io.micrometer.core.instrument.binder.jvm.JvmThreadMetrics; -import io.micrometer.core.instrument.binder.system.ProcessorMetrics; -import io.micrometer.core.instrument.binder.system.UptimeMetrics; +import io.micrometer.core.instrument.composite.CompositeMeterRegistry; +import io.micrometer.core.instrument.logging.LoggingMeterRegistry; +import io.micrometer.core.instrument.logging.LoggingRegistryConfig; import io.micrometer.registry.otlp.OtlpConfig; import io.micrometer.registry.otlp.OtlpMeterRegistry; @@ -87,6 +85,9 @@ public static void main(String[] args) throws IOException { } private static @NonNull Metrics initOTLPMetrics() { + CompositeMeterRegistry compositeRegistry = new CompositeMeterRegistry(); + + // Add OTLP registry Map configProperties = loadConfigFromYaml(); var otlpConfig = new OtlpConfig() { @@ -107,18 +108,43 @@ public Map resourceAttributes() { } }; - MeterRegistry registry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + MeterRegistry 
otlpRegistry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + compositeRegistry.add(otlpRegistry); + + // Add console logging registry if enabled (for development) + // String enableConsoleLogging = System.getenv("METRICS_CONSOLE_LOGGING"); + // todo remove + String enableConsoleLogging = "true"; + if ("true".equalsIgnoreCase(enableConsoleLogging)) { + log.info("Console metrics logging enabled"); + LoggingMeterRegistry loggingRegistry = + new LoggingMeterRegistry( + new LoggingRegistryConfig() { + @Override + public String get(String key) { + return null; + } + + @Override + public Duration step() { + return Duration.ofSeconds(10); // Log metrics every 30 seconds + } + }, + Clock.SYSTEM); + compositeRegistry.add(loggingRegistry); + } // Register JVM and system metrics log.info("Registering JVM and system metrics..."); - new JvmMemoryMetrics().bindTo(registry); - new JvmGcMetrics().bindTo(registry); - new JvmThreadMetrics().bindTo(registry); - new ClassLoaderMetrics().bindTo(registry); - new ProcessorMetrics().bindTo(registry); - new UptimeMetrics().bindTo(registry); - - return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) + // todo add back + // new JvmMemoryMetrics().bindTo(compositeRegistry); + // new JvmGcMetrics().bindTo(compositeRegistry); + // new JvmThreadMetrics().bindTo(compositeRegistry); + // new ClassLoaderMetrics().bindTo(compositeRegistry); + // new ProcessorMetrics().bindTo(compositeRegistry); + // new UptimeMetrics().bindTo(compositeRegistry); + + return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) .collectingMetricsPerResource() .build(); } diff --git a/sample-operators/webpage/src/main/resources/log4j2.xml b/sample-operators/webpage/src/main/resources/log4j2.xml index ebe273e40e..7cced1edbd 100644 --- a/sample-operators/webpage/src/main/resources/log4j2.xml +++ b/sample-operators/webpage/src/main/resources/log4j2.xml @@ -19,7 +19,7 @@ - + From 
a0b98036b9a4cf909ca67e2143bf72a3c67c378c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 19:45:40 +0100 Subject: [PATCH 21/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetrics.java | 297 +++++++------ .../micrometer/MicrometerMetricsV2.java | 413 ++++++++++++++++++ .../AbstractMicrometerMetricsTestFixture.java | 17 + .../micrometer/DefaultBehaviorIT.java | 23 + .../DelayedMetricsCleaningOnDeleteIT.java | 46 ++ .../micrometer/NoPerResourceCollectionIT.java | 15 + .../operator/api/monitoring/Metrics.java | 21 + .../operator/processing/Controller.java | 22 + .../operator/sample/MySQLSchemaOperator.java | 4 +- .../operator/sample/WebPageOperator.java | 4 +- 10 files changed, 735 insertions(+), 127 deletions(-) create mode 100644 micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java create mode 100644 micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java index 45f1517864..26971e7fa9 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetrics.java @@ -17,10 +17,10 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Consumer; - -import org.jspecify.annotations.NonNull; import 
io.fabric8.kubernetes.api.model.HasMetadata; import io.javaoperatorsdk.operator.OperatorException; @@ -32,23 +32,25 @@ import io.javaoperatorsdk.operator.processing.event.Event; import io.javaoperatorsdk.operator.processing.event.ResourceID; import io.javaoperatorsdk.operator.processing.event.source.controller.ResourceEvent; +import io.micrometer.core.instrument.Meter; import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.Tag; import io.micrometer.core.instrument.Timer; +import static io.javaoperatorsdk.operator.api.reconciler.Constants.CONTROLLER_NAME; + +@Deprecated(forRemoval = true) public class MicrometerMetrics implements Metrics { - private static final String SUCCESS_SUFFIX = "success"; - private static final String FAILURE_SUFFIX = "failure"; private static final String PREFIX = "operator.sdk."; private static final String RECONCILIATIONS = "reconciliations."; - private static final String RECONCILIATIONS_FAILED = PREFIX + RECONCILIATIONS + FAILURE_SUFFIX; - private static final String RECONCILIATIONS_SUCCESS = PREFIX + RECONCILIATIONS + SUCCESS_SUFFIX; - private static final String RECONCILIATIONS_RETRIES_NUMBER = - PREFIX + RECONCILIATIONS + "retries.number"; - private static final String RECONCILIATIONS_STARTED = PREFIX + RECONCILIATIONS + "started"; - private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions"; - private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "active"; + private static final String RECONCILIATIONS_FAILED = RECONCILIATIONS + "failed"; + private static final String RECONCILIATIONS_SUCCESS = RECONCILIATIONS + "success"; + private static final String RECONCILIATIONS_RETRIES_LAST = RECONCILIATIONS + "retries.last"; + private static final String RECONCILIATIONS_RETRIES_NUMBER = RECONCILIATIONS + "retries.number"; + private static final String RECONCILIATIONS_STARTED = RECONCILIATIONS + "started"; + private static final String 
RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions."; + private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "queue.size."; private static final String NAME = "name"; private static final String NAMESPACE = "namespace"; private static final String GROUP = "group"; @@ -56,26 +58,23 @@ public class MicrometerMetrics implements Metrics { private static final String KIND = "kind"; private static final String SCOPE = "scope"; private static final String METADATA_PREFIX = "resource."; - private static final String CONTROLLERS = "controllers."; - private static final String RECONCILIATION_EXECUTION_TIME = - PREFIX + RECONCILIATIONS + "execution" + ".duration"; - private static final String CONTROLLERS_SUCCESSFUL_EXECUTION = - PREFIX + CONTROLLERS + SUCCESS_SUFFIX; - private static final String CONTROLLERS_FAILED_EXECUTION = PREFIX + CONTROLLERS + FAILURE_SUFFIX; + private static final String CONTROLLERS_EXECUTION = "controllers.execution."; private static final String CONTROLLER = "controller"; - private static final String CONTROLLER_NAME = CONTROLLER + ".name"; + private static final String SUCCESS_SUFFIX = ".success"; + private static final String FAILURE_SUFFIX = ".failure"; + private static final String TYPE = "type"; + private static final String EXCEPTION = "exception"; private static final String EVENT = "event"; private static final String ACTION = "action"; - private static final String EVENTS_RECEIVED = PREFIX + "events.received"; - private static final String EVENTS_DELETE = PREFIX + "events.delete"; + private static final String EVENTS_RECEIVED = "events.received"; + private static final String EVENTS_DELETE = "events.delete"; private static final String CLUSTER = "cluster"; private static final String SIZE_SUFFIX = ".size"; private static final String UNKNOWN_ACTION = "UNKNOWN"; private final boolean collectPerResourceMetrics; private final MeterRegistry registry; - // todo double check if we actually need this 
private final Map gauges = new ConcurrentHashMap<>(); - private final Consumer timerConfig; + private final Cleaner cleaner; /** * Creates a MicrometerMetrics instance configured to not collect per-resource metrics, just @@ -85,7 +84,7 @@ public class MicrometerMetrics implements Metrics { * @return a MicrometerMetrics instance configured to not collect per-resource metrics */ public static MicrometerMetrics withoutPerResourceMetrics(MeterRegistry registry) { - return new MicrometerMetrics(registry, false, null); + return new MicrometerMetrics(registry, Cleaner.NOOP, false); } /** @@ -109,70 +108,58 @@ public static MicrometerMetricsBuilder newMicrometerMetricsBuilder(MeterRegistry */ public static PerResourceCollectingMicrometerMetricsBuilder newPerResourceCollectingMicrometerMetricsBuilder(MeterRegistry registry) { - return new PerResourceCollectingMicrometerMetricsBuilder(registry, null); + return new PerResourceCollectingMicrometerMetricsBuilder(registry); } - // todo as v2 class - // todo make backwards compatible /** - * Creates a micrometer-based Metrics implementation. + * Creates a micrometer-based Metrics implementation that cleans up {@link Meter}s associated with + * deleted resources as specified by the (possibly {@code null}) provided {@link Cleaner} + * instance. 
* * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @param cleaner the {@link Cleaner} to use * @param collectingPerResourceMetrics whether to collect per resource metrics - * @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5, - * 0.95, 0.99 and histogram */ private MicrometerMetrics( - MeterRegistry registry, - boolean collectingPerResourceMetrics, - Consumer timerConfig) { + MeterRegistry registry, Cleaner cleaner, boolean collectingPerResourceMetrics) { this.registry = registry; + this.cleaner = cleaner; this.collectPerResourceMetrics = collectingPerResourceMetrics; - this.timerConfig = - timerConfig != null - ? timerConfig - : builder -> builder.publishPercentiles(0.5, 0.95, 0.99).publishPercentileHistogram(); } @Override public void controllerRegistered(Controller controller) { final var configuration = controller.getConfiguration(); final var name = configuration.getName(); - final var executingThreadsRefName = reconciliationExecutionGaugeRefName(name); + final var executingThreadsName = RECONCILIATIONS_EXECUTIONS + name; final var resourceClass = configuration.getResourceClass(); - final var tags = new ArrayList(); - tags.add(Tag.of(CONTROLLER_NAME, name)); + final var tags = new ArrayList(3); addGVKTags(GroupVersionKind.gvkFor(resourceClass), tags, false); AtomicInteger executingThreads = - registry.gauge(RECONCILIATIONS_EXECUTIONS, tags, new AtomicInteger(0)); - gauges.put(executingThreadsRefName, executingThreads); + registry.gauge(executingThreadsName, tags, new AtomicInteger(0)); + gauges.put(executingThreadsName, executingThreads); - final var controllerQueueRefName = controllerQueueSizeGaugeRefName(name); + final var controllerQueueName = RECONCILIATIONS_QUEUE_SIZE + name; AtomicInteger controllerQueueSize = - registry.gauge(RECONCILIATIONS_QUEUE_SIZE, tags, new AtomicInteger(0)); - gauges.put(controllerQueueRefName, controllerQueueSize); - } - - private static @NonNull 
String reconciliationExecutionGaugeRefName(String controllerName) { - return RECONCILIATIONS_EXECUTIONS + "." + controllerName; + registry.gauge(controllerQueueName, tags, new AtomicInteger(0)); + gauges.put(controllerQueueName, controllerQueueSize); } - private static @NonNull String controllerQueueSizeGaugeRefName(String controllerName) { - return RECONCILIATIONS_QUEUE_SIZE + "." + controllerName; - } - - // todo does it make sense to have both controller and reconciler execution counters? @Override public T timeControllerExecution(ControllerExecution execution) { final var name = execution.controllerName(); + final var execName = PREFIX + CONTROLLERS_EXECUTION + execution.name(); final var resourceID = execution.resourceID(); final var metadata = execution.metadata(); final var tags = new ArrayList(16); tags.add(Tag.of(CONTROLLER, name)); addMetadataTags(resourceID, metadata, tags, true); - final var timerBuilder = Timer.builder(RECONCILIATION_EXECUTION_TIME).tags(tags); - timerConfig.accept(timerBuilder); - final var timer = timerBuilder.register(registry); + final var timer = + Timer.builder(execName) + .tags(tags) + .publishPercentiles(0.3, 0.5, 0.95) + .publishPercentileHistogram() + .register(registry); try { final var result = timer.record( @@ -183,23 +170,27 @@ public T timeControllerExecution(ControllerExecution execution) { throw new OperatorException(e); } }); - registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER, name).increment(); + final var successType = execution.successTypeName(result); + registry.counter(execName + SUCCESS_SUFFIX, CONTROLLER, name, TYPE, successType).increment(); return result; } catch (Exception e) { - registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER, name).increment(); + final var exception = e.getClass().getSimpleName(); + registry + .counter(execName + FAILURE_SUFFIX, CONTROLLER, name, EXCEPTION, exception) + .increment(); throw e; } } @Override public void receivedEvent(Event event, Map metadata) { - if 
(event instanceof ResourceEvent resourceEvent) { + if (event instanceof ResourceEvent) { incrementCounter( event.getRelatedCustomResourceID(), EVENTS_RECEIVED, metadata, Tag.of(EVENT, event.getClass().getSimpleName()), - Tag.of(ACTION, resourceEvent.getAction().toString())); + Tag.of(ACTION, ((ResourceEvent) event).getAction().toString())); } else { incrementCounter( event.getRelatedCustomResourceID(), @@ -213,59 +204,68 @@ public void receivedEvent(Event event, Map metadata) { @Override public void cleanupDoneFor(ResourceID resourceID, Map metadata) { incrementCounter(resourceID, EVENTS_DELETE, metadata); + + cleaner.removeMetersFor(resourceID); } @Override public void reconcileCustomResource( HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { Optional retryInfo = Optional.ofNullable(retryInfoNullable); - ResourceID resourceID = ResourceID.fromResource(resource); - - // Record the counter without retry tags - incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); - - // todo add metric with for resources in exhaisted retry - // Update retry number gauge - int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); - updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); + incrementCounter( + ResourceID.fromResource(resource), + RECONCILIATIONS_STARTED, + metadata, + Tag.of( + RECONCILIATIONS_RETRIES_NUMBER, + String.valueOf(retryInfo.map(RetryInfo::getAttemptCount).orElse(0))), + Tag.of( + RECONCILIATIONS_RETRIES_LAST, + String.valueOf(retryInfo.map(RetryInfo::isLastAttempt).orElse(true)))); var controllerQueueSize = - gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); controllerQueueSize.incrementAndGet(); } @Override public void successfullyFinishedReconciliation( HasMetadata resource, Map metadata) { - ResourceID resourceID = ResourceID.fromResource(resource); - incrementCounter(resourceID, 
RECONCILIATIONS_SUCCESS, metadata); - - // Reset retry gauges on successful reconciliation - updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); + incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_SUCCESS, metadata); } @Override public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); reconcilerExecutions.incrementAndGet(); } @Override public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(RECONCILIATIONS_EXECUTIONS + metadata.get(CONTROLLER_NAME)); reconcilerExecutions.decrementAndGet(); var controllerQueueSize = - gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(RECONCILIATIONS_QUEUE_SIZE + metadata.get(CONTROLLER_NAME)); controllerQueueSize.decrementAndGet(); } @Override public void failedReconciliation( HasMetadata resource, Exception exception, Map metadata) { - incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_FAILED, metadata); + var cause = exception.getCause(); + if (cause == null) { + cause = exception; + } else if (cause instanceof RuntimeException) { + cause = cause.getCause() != null ? 
cause.getCause() : cause; + } + incrementCounter( + ResourceID.fromResource(resource), + RECONCILIATIONS_FAILED, + metadata, + Tag.of(EXCEPTION, cause.getClass().getSimpleName())); } @Override @@ -322,81 +322,71 @@ private void incrementCounter( tags.addAll(List.of(additionalTags)); } - final var counter = registry.counter(counterName, tags); + final var counter = registry.counter(PREFIX + counterName, tags); + cleaner.recordAssociation(id, counter); counter.increment(); } - private void updateGauge( - ResourceID id, Map metadata, String gaugeName, int value) { - final var tags = new ArrayList(6); - addMetadataTags(id, metadata, tags, false); - - AtomicInteger gauge = - gauges.computeIfAbsent( - gaugeName, key -> registry.gauge(gaugeName, tags, new AtomicInteger(0))); - gauge.set(value); + protected Set recordedMeterIdsFor(ResourceID resourceID) { + return cleaner.recordedMeterIdsFor(resourceID); } public static class PerResourceCollectingMicrometerMetricsBuilder extends MicrometerMetricsBuilder { - private PerResourceCollectingMicrometerMetricsBuilder( - MeterRegistry registry, Consumer timerConfig) { + private int cleaningThreadsNumber; + private int cleanUpDelayInSeconds; + + private PerResourceCollectingMicrometerMetricsBuilder(MeterRegistry registry) { super(registry); - this.executionTimerConfig = timerConfig; } /** - * Configures the Timer used for timing controller executions. By default, timers are configured - * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. You can set: {@code - * .minimumExpectedValue(Duration.ofMillis(...)).maximumExpectedValue(Duration.ofSeconds(...)) } - * so micrometer can create the buckets for you. - * - * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder - * will already have the metric name and tags set. 
- * @return this builder for method chaining + * @param cleaningThreadsNumber the maximal number of threads that can be assigned to the + * removal of {@link Meter}s associated with deleted resources, defaults to 1 if not + * specified or if the provided number is lesser or equal to 0 */ - @Override - public PerResourceCollectingMicrometerMetricsBuilder withExecutionTimerConfig( - Consumer executionTimerConfig) { - this.executionTimerConfig = executionTimerConfig; + public PerResourceCollectingMicrometerMetricsBuilder withCleaningThreadNumber( + int cleaningThreadsNumber) { + this.cleaningThreadsNumber = cleaningThreadsNumber <= 0 ? 1 : cleaningThreadsNumber; + return this; + } + + /** + * @param cleanUpDelayInSeconds the number of seconds to wait before {@link Meter}s are removed + * for deleted resources, defaults to 1 (meaning meters will be removed one second after the + * associated resource is deleted) if not specified or if the provided number is lesser than + * 0. Threading and the general interaction model of interacting with the API server means + * that it's not possible to ensure that meters are immediately deleted in all cases so a + * minimal delay of one second is always enforced + */ + public PerResourceCollectingMicrometerMetricsBuilder withCleanUpDelayInSeconds( + int cleanUpDelayInSeconds) { + this.cleanUpDelayInSeconds = Math.max(cleanUpDelayInSeconds, 1); return this; } @Override public MicrometerMetrics build() { - return new MicrometerMetrics(registry, true, executionTimerConfig); + final var cleaner = + new DelayedCleaner(registry, cleanUpDelayInSeconds, cleaningThreadsNumber); + return new MicrometerMetrics(registry, cleaner, true); } } public static class MicrometerMetricsBuilder { protected final MeterRegistry registry; private boolean collectingPerResourceMetrics = true; - protected Consumer executionTimerConfig = null; private MicrometerMetricsBuilder(MeterRegistry registry) { this.registry = registry; } - /** - * Configures the Timer 
used for timing controller executions. By default, timers are configured - * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. - * - * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder - * will already have the metric name and tags set. - * @return this builder for method chaining - */ - public MicrometerMetricsBuilder withExecutionTimerConfig( - Consumer executionTimerConfig) { - this.executionTimerConfig = executionTimerConfig; - return this; - } - /** Configures the instance to collect metrics on a per-resource basis. */ @SuppressWarnings("unused") public PerResourceCollectingMicrometerMetricsBuilder collectingMetricsPerResource() { collectingPerResourceMetrics = true; - return new PerResourceCollectingMicrometerMetricsBuilder(registry, executionTimerConfig); + return new PerResourceCollectingMicrometerMetricsBuilder(registry); } /** @@ -410,7 +400,68 @@ public MicrometerMetricsBuilder notCollectingMetricsPerResource() { } public MicrometerMetrics build() { - return new MicrometerMetrics(registry, collectingPerResourceMetrics, executionTimerConfig); + return new MicrometerMetrics(registry, Cleaner.NOOP, collectingPerResourceMetrics); + } + } + + interface Cleaner { + Cleaner NOOP = new Cleaner() {}; + + default void removeMetersFor(ResourceID resourceID) {} + + default void recordAssociation(ResourceID resourceID, Meter meter) {} + + default Set recordedMeterIdsFor(ResourceID resourceID) { + return Collections.emptySet(); + } + } + + static class DefaultCleaner implements Cleaner { + private final Map> metersPerResource = new ConcurrentHashMap<>(); + private final MeterRegistry registry; + + private DefaultCleaner(MeterRegistry registry) { + this.registry = registry; + } + + @Override + public void removeMetersFor(ResourceID resourceID) { + // remove each meter + final var toClean = metersPerResource.get(resourceID); + if (toClean != null) { + toClean.forEach(registry::remove); + } + // then clean-up local 
recording of associations + metersPerResource.remove(resourceID); + } + + @Override + public void recordAssociation(ResourceID resourceID, Meter meter) { + metersPerResource.computeIfAbsent(resourceID, id -> new HashSet<>()).add(meter.getId()); + } + + @Override + public Set recordedMeterIdsFor(ResourceID resourceID) { + return metersPerResource.get(resourceID); + } + } + + static class DelayedCleaner extends MicrometerMetrics.DefaultCleaner { + private final ScheduledExecutorService metersCleaner; + private final int cleanUpDelayInSeconds; + + private DelayedCleaner( + MeterRegistry registry, int cleanUpDelayInSeconds, int cleaningThreadsNumber) { + super(registry); + this.cleanUpDelayInSeconds = cleanUpDelayInSeconds; + this.metersCleaner = Executors.newScheduledThreadPool(cleaningThreadsNumber); + } + + @Override + public void removeMetersFor(ResourceID resourceID) { + // schedule deletion of meters associated with ResourceID + metersCleaner.schedule( + () -> super.removeMetersFor(resourceID), cleanUpDelayInSeconds, TimeUnit.SECONDS); } } } diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java new file mode 100644 index 0000000000..eeb20c67db --- /dev/null +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java @@ -0,0 +1,413 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.monitoring.micrometer; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; + +import org.jspecify.annotations.NonNull; + +import io.fabric8.kubernetes.api.model.HasMetadata; +import io.javaoperatorsdk.operator.OperatorException; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.api.reconciler.Constants; +import io.javaoperatorsdk.operator.api.reconciler.RetryInfo; +import io.javaoperatorsdk.operator.processing.Controller; +import io.javaoperatorsdk.operator.processing.GroupVersionKind; +import io.javaoperatorsdk.operator.processing.event.Event; +import io.javaoperatorsdk.operator.processing.event.ResourceID; +import io.javaoperatorsdk.operator.processing.event.source.controller.ResourceEvent; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Tag; +import io.micrometer.core.instrument.Timer; + +public class MicrometerMetricsV2 implements Metrics { + + private static final String SUCCESS_SUFFIX = "success"; + private static final String FAILURE_SUFFIX = "failure"; + private static final String PREFIX = "operator.sdk."; + private static final String RECONCILIATIONS = "reconciliations."; + private static final String RECONCILIATIONS_FAILED = PREFIX + RECONCILIATIONS + FAILURE_SUFFIX; + private static final String RECONCILIATIONS_SUCCESS = PREFIX + RECONCILIATIONS + SUCCESS_SUFFIX; + private static final String RECONCILIATIONS_RETRIES_NUMBER = + PREFIX + RECONCILIATIONS + "retries.number"; + private static final String RECONCILIATIONS_STARTED = PREFIX + RECONCILIATIONS + "started"; + private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions"; + private static final String 
RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "active"; + private static final String NAME = "name"; + private static final String NAMESPACE = "namespace"; + private static final String GROUP = "group"; + private static final String VERSION = "version"; + private static final String KIND = "kind"; + private static final String SCOPE = "scope"; + private static final String METADATA_PREFIX = "resource."; + private static final String CONTROLLERS = "controllers."; + private static final String RECONCILIATION_EXECUTION_TIME = + PREFIX + RECONCILIATIONS + "execution" + ".duration"; + private static final String CONTROLLERS_SUCCESSFUL_EXECUTION = + PREFIX + CONTROLLERS + SUCCESS_SUFFIX; + private static final String CONTROLLERS_FAILED_EXECUTION = PREFIX + CONTROLLERS + FAILURE_SUFFIX; + private static final String CONTROLLER = "controller"; + private static final String CONTROLLER_NAME = CONTROLLER + ".name"; + private static final String EVENT = "event"; + private static final String ACTION = "action"; + private static final String EVENTS_RECEIVED = PREFIX + "events.received"; + private static final String EVENTS_DELETE = PREFIX + "events.delete"; + private static final String CLUSTER = "cluster"; + private static final String SIZE_SUFFIX = ".size"; + private static final String UNKNOWN_ACTION = "UNKNOWN"; + private final boolean collectPerResourceMetrics; + private final MeterRegistry registry; + private final Map gauges = new ConcurrentHashMap<>(); + private final Consumer timerConfig; + + /** + * Creates a MicrometerMetricsV2 instance configured to not collect per-resource metrics, just + * aggregates per resource **type** + * + * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics + */ + public static MicrometerMetricsV2 withoutPerResourceMetrics(MeterRegistry registry) { + return new MicrometerMetricsV2(registry, false, null); + } + + /** 
+ * Creates a new builder to configure how the eventual MicrometerMetricsV2 instance will behave. + * + * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics + * @see MicrometerMetricsBuilder + */ + public static MicrometerMetricsBuilder newMicrometerMetricsBuilder(MeterRegistry registry) { + return new MicrometerMetricsBuilder(registry); + } + + /** + * Creates a new builder to configure how the eventual MicrometerMetricsV2 instance will behave, + * pre-configuring it to collect metrics per resource. + * + * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics + * @see PerResourceCollectingMicrometerMetricsBuilder + */ + public static PerResourceCollectingMicrometerMetricsBuilder + newPerResourceCollectingMicrometerMetricsBuilder(MeterRegistry registry) { + return new PerResourceCollectingMicrometerMetricsBuilder(registry, null); + } + + /** + * Creates a micrometer-based Metrics implementation. + * + * @param registry the {@link MeterRegistry} instance to use for metrics recording + * @param collectingPerResourceMetrics whether to collect per resource metrics + * @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5, + * 0.95, 0.99 and histogram + */ + private MicrometerMetricsV2( + MeterRegistry registry, + boolean collectingPerResourceMetrics, + Consumer timerConfig) { + this.registry = registry; + this.collectPerResourceMetrics = collectingPerResourceMetrics; + this.timerConfig = + timerConfig != null + ? 
timerConfig + : builder -> builder.publishPercentiles(0.5, 0.95, 0.99).publishPercentileHistogram(); + } + + @Override + public void controllerRegistered(Controller controller) { + final var configuration = controller.getConfiguration(); + final var name = configuration.getName(); + final var executingThreadsRefName = reconciliationExecutionGaugeRefName(name); + final var resourceClass = configuration.getResourceClass(); + final var tags = new ArrayList(); + tags.add(Tag.of(CONTROLLER_NAME, name)); + addGVKTags(GroupVersionKind.gvkFor(resourceClass), tags, false); + AtomicInteger executingThreads = + registry.gauge(RECONCILIATIONS_EXECUTIONS, tags, new AtomicInteger(0)); + gauges.put(executingThreadsRefName, executingThreads); + + final var controllerQueueRefName = controllerQueueSizeGaugeRefName(name); + AtomicInteger controllerQueueSize = + registry.gauge(RECONCILIATIONS_QUEUE_SIZE, tags, new AtomicInteger(0)); + gauges.put(controllerQueueRefName, controllerQueueSize); + } + + private static @NonNull String reconciliationExecutionGaugeRefName(String controllerName) { + return RECONCILIATIONS_EXECUTIONS + "." + controllerName; + } + + private static @NonNull String controllerQueueSizeGaugeRefName(String controllerName) { + return RECONCILIATIONS_QUEUE_SIZE + "." + controllerName; + } + + // todo does it make sense to have both controller and reconciler execution counters? 
+ @Override + public T timeControllerExecution(ControllerExecution execution) { + final var name = execution.controllerName(); + final var resourceID = execution.resourceID(); + final var metadata = execution.metadata(); + final var tags = new ArrayList(16); + tags.add(Tag.of(CONTROLLER, name)); + addMetadataTags(resourceID, metadata, tags, true); + final var timerBuilder = Timer.builder(RECONCILIATION_EXECUTION_TIME).tags(tags); + timerConfig.accept(timerBuilder); + final var timer = timerBuilder.register(registry); + try { + final var result = + timer.record( + () -> { + try { + return execution.execute(); + } catch (Exception e) { + throw new OperatorException(e); + } + }); + registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER, name).increment(); + return result; + } catch (Exception e) { + registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER, name).increment(); + throw e; + } + } + + @Override + public void receivedEvent(Event event, Map metadata) { + if (event instanceof ResourceEvent resourceEvent) { + incrementCounter( + event.getRelatedCustomResourceID(), + EVENTS_RECEIVED, + metadata, + Tag.of(EVENT, event.getClass().getSimpleName()), + Tag.of(ACTION, resourceEvent.getAction().toString())); + } else { + incrementCounter( + event.getRelatedCustomResourceID(), + EVENTS_RECEIVED, + metadata, + Tag.of(EVENT, event.getClass().getSimpleName()), + Tag.of(ACTION, UNKNOWN_ACTION)); + } + } + + @Override + public void cleanupDoneFor(ResourceID resourceID, Map metadata) { + incrementCounter(resourceID, EVENTS_DELETE, metadata); + } + + @Override + public void reconcileCustomResource( + HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { + Optional retryInfo = Optional.ofNullable(retryInfoNullable); + ResourceID resourceID = ResourceID.fromResource(resource); + + // Record the counter without retry tags + incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); + + // todo add metric with for resources in exhaisted retry + // 
Update retry number gauge + int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); + + var controllerQueueSize = + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + controllerQueueSize.incrementAndGet(); + } + + @Override + public void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) { + ResourceID resourceID = ResourceID.fromResource(resource); + incrementCounter(resourceID, RECONCILIATIONS_SUCCESS, metadata); + + // Reset retry gauges on successful reconciliation + updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); + } + + @Override + public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { + var reconcilerExecutions = + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + reconcilerExecutions.incrementAndGet(); + } + + @Override + public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { + var reconcilerExecutions = + gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + reconcilerExecutions.decrementAndGet(); + + var controllerQueueSize = + gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + controllerQueueSize.decrementAndGet(); + } + + @Override + public void failedReconciliation( + HasMetadata resource, Exception exception, Map metadata) { + incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_FAILED, metadata); + } + + @Override + public > T monitorSizeOf(T map, String name) { + return registry.gaugeMapSize(PREFIX + name + SIZE_SUFFIX, Collections.emptyList(), map); + } + + private void addMetadataTags( + ResourceID resourceID, Map metadata, List tags, boolean prefixed) { + if (collectPerResourceMetrics) { + addTag(NAME, resourceID.getName(), tags, prefixed); + addTagOmittingOnEmptyValue(NAMESPACE, 
resourceID.getNamespace().orElse(null), tags, prefixed); + } + addTag(SCOPE, getScope(resourceID), tags, prefixed); + final var gvk = (GroupVersionKind) metadata.get(Constants.RESOURCE_GVK_KEY); + if (gvk != null) { + addGVKTags(gvk, tags, prefixed); + } + } + + private static void addTag(String name, String value, List tags, boolean prefixed) { + tags.add(Tag.of(getPrefixedMetadataTag(name, prefixed), value)); + } + + private static void addTagOmittingOnEmptyValue( + String name, String value, List tags, boolean prefixed) { + if (value != null && !value.isBlank()) { + addTag(name, value, tags, prefixed); + } + } + + private static String getPrefixedMetadataTag(String tagName, boolean prefixed) { + return prefixed ? METADATA_PREFIX + tagName : tagName; + } + + private static String getScope(ResourceID resourceID) { + return resourceID.getNamespace().isPresent() ? NAMESPACE : CLUSTER; + } + + private static void addGVKTags(GroupVersionKind gvk, List tags, boolean prefixed) { + addTagOmittingOnEmptyValue(GROUP, gvk.getGroup(), tags, prefixed); + addTag(VERSION, gvk.getVersion(), tags, prefixed); + addTag(KIND, gvk.getKind(), tags, prefixed); + } + + private void incrementCounter( + ResourceID id, String counterName, Map metadata, Tag... additionalTags) { + final var additionalTagsNb = + additionalTags != null && additionalTags.length > 0 ? additionalTags.length : 0; + final var metadataNb = metadata != null ? 
metadata.size() : 0; + final var tags = new ArrayList(6 + additionalTagsNb + metadataNb); + addMetadataTags(id, metadata, tags, false); + if (additionalTagsNb > 0) { + tags.addAll(List.of(additionalTags)); + } + + final var counter = registry.counter(counterName, tags); + counter.increment(); + } + + private void updateGauge( + ResourceID id, Map metadata, String gaugeName, int value) { + final var tags = new ArrayList(6); + addMetadataTags(id, metadata, tags, false); + + AtomicInteger gauge = + gauges.computeIfAbsent( + gaugeName, key -> registry.gauge(gaugeName, tags, new AtomicInteger(0))); + gauge.set(value); + } + + public static class PerResourceCollectingMicrometerMetricsBuilder + extends MicrometerMetricsBuilder { + + private PerResourceCollectingMicrometerMetricsBuilder( + MeterRegistry registry, Consumer timerConfig) { + super(registry); + this.executionTimerConfig = timerConfig; + } + + /** + * Configures the Timer used for timing controller executions. By default, timers are configured + * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. You can set: {@code + * .minimumExpectedValue(Duration.ofMillis(...)).maximumExpectedValue(Duration.ofSeconds(...)) } + * so micrometer can create the buckets for you. + * + * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder + * will already have the metric name and tags set. 
+ * @return this builder for method chaining + */ + @Override + public PerResourceCollectingMicrometerMetricsBuilder withExecutionTimerConfig( + Consumer executionTimerConfig) { + this.executionTimerConfig = executionTimerConfig; + return this; + } + + @Override + public MicrometerMetricsV2 build() { + return new MicrometerMetricsV2(registry, true, executionTimerConfig); + } + } + + public static class MicrometerMetricsBuilder { + protected final MeterRegistry registry; + private boolean collectingPerResourceMetrics = true; + protected Consumer executionTimerConfig = null; + + private MicrometerMetricsBuilder(MeterRegistry registry) { + this.registry = registry; + } + + /** + * Configures the Timer used for timing controller executions. By default, timers are configured + * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. + * + * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder + * will already have the metric name and tags set. + * @return this builder for method chaining + */ + public MicrometerMetricsBuilder withExecutionTimerConfig( + Consumer executionTimerConfig) { + this.executionTimerConfig = executionTimerConfig; + return this; + } + + /** Configures the instance to collect metrics on a per-resource basis. */ + @SuppressWarnings("unused") + public PerResourceCollectingMicrometerMetricsBuilder collectingMetricsPerResource() { + collectingPerResourceMetrics = true; + return new PerResourceCollectingMicrometerMetricsBuilder(registry, executionTimerConfig); + } + + /** + * Configures the instance to only collect metrics per resource **type**, in an aggregate + * fashion, instead of per resource instance. 
+ */ + @SuppressWarnings("unused") + public MicrometerMetricsBuilder notCollectingMetricsPerResource() { + collectingPerResourceMetrics = false; + return this; + } + + public MicrometerMetricsV2 build() { + return new MicrometerMetricsV2(registry, collectingPerResourceMetrics, executionTimerConfig); + } + } +} diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java index b0346a2444..660ac5381c 100644 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/AbstractMicrometerMetricsTestFixture.java @@ -30,6 +30,7 @@ import io.micrometer.core.instrument.Meter; import io.micrometer.core.instrument.simple.SimpleMeterRegistry; +import static org.assertj.core.api.Assertions.assertThat; import static org.awaitility.Awaitility.await; @TestInstance(TestInstance.Lifecycle.PER_CLASS) @@ -65,12 +66,28 @@ void properlyHandlesResourceDeletion() throws Exception { .isEmpty()); final var resourceID = ResourceID.fromResource(created); + final var meters = preDeleteChecks(resourceID); // delete the resource and wait for it to be deleted operator.delete(testResource); await().until(() -> operator.get(ConfigMap.class, testResourceName) == null); + + postDeleteChecks(resourceID, meters); + } + + protected Set preDeleteChecks(ResourceID resourceID) { + // check that we properly recorded meters associated with the resource + final var meters = metrics.recordedMeterIdsFor(resourceID); + // metrics are collected per resource + assertThat(registry.getMetersAsString()).contains(resourceID.getName()); + assertThat(meters).isNotNull(); + assertThat(meters).isNotEmpty(); + return meters; } + protected void 
postDeleteChecks(ResourceID resourceID, Set recordedMeters) + throws Exception {} + @ControllerConfiguration private static class MetricsCleaningTestReconciler implements Reconciler, Cleaner { diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java index 21376ea58d..928b01f55e 100644 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DefaultBehaviorIT.java @@ -15,9 +15,32 @@ */ package io.javaoperatorsdk.operator.monitoring.micrometer; +import java.util.Collections; +import java.util.Set; + +import io.javaoperatorsdk.operator.processing.event.ResourceID; +import io.micrometer.core.instrument.Meter; + +import static org.assertj.core.api.Assertions.assertThat; + public class DefaultBehaviorIT extends AbstractMicrometerMetricsTestFixture { @Override protected MicrometerMetrics getMetrics() { return MicrometerMetrics.newMicrometerMetricsBuilder(registry).build(); } + + @Override + protected Set preDeleteChecks(ResourceID resourceID) { + // no meter should be recorded because we're not tracking anything to be deleted later + assertThat(metrics.recordedMeterIdsFor(resourceID)).isEmpty(); + // metrics are collected per resource by default for now, this will change in a future release + assertThat(registry.getMetersAsString()).contains(resourceID.getName()); + return Collections.emptySet(); + } + + @Override + protected void postDeleteChecks(ResourceID resourceID, Set recordedMeters) { + // meters should be neither recorded, nor removed by default + assertThat(registry.getRemoved()).isEmpty(); + } } diff --git a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java 
b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java new file mode 100644 index 0000000000..bfed1f1089 --- /dev/null +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/DelayedMetricsCleaningOnDeleteIT.java @@ -0,0 +1,46 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.monitoring.micrometer; + +import java.time.Duration; +import java.util.Set; + +import io.javaoperatorsdk.operator.processing.event.ResourceID; +import io.micrometer.core.instrument.Meter; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DelayedMetricsCleaningOnDeleteIT extends AbstractMicrometerMetricsTestFixture { + + private static final int testDelay = 1; + + @Override + protected MicrometerMetrics getMetrics() { + return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(registry) + .withCleanUpDelayInSeconds(testDelay) + .withCleaningThreadNumber(2) + .build(); + } + + @Override + protected void postDeleteChecks(ResourceID resourceID, Set recordedMeters) + throws Exception { + // check that the meters are properly removed after the specified delay + Thread.sleep(Duration.ofSeconds(testDelay).toMillis()); + assertThat(registry.getRemoved()).isEqualTo(recordedMeters); + assertThat(metrics.recordedMeterIdsFor(resourceID)).isNull(); + } +} diff --git 
a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java index 2fcd5c152f..c8dc32cd91 100644 --- a/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java +++ b/micrometer-support/src/test/java/io/javaoperatorsdk/operator/monitoring/micrometer/NoPerResourceCollectionIT.java @@ -15,9 +15,24 @@ */ package io.javaoperatorsdk.operator.monitoring.micrometer; +import java.util.Collections; +import java.util.Set; + +import io.javaoperatorsdk.operator.processing.event.ResourceID; +import io.micrometer.core.instrument.Meter; + +import static org.assertj.core.api.Assertions.assertThat; + public class NoPerResourceCollectionIT extends AbstractMicrometerMetricsTestFixture { @Override protected MicrometerMetrics getMetrics() { return MicrometerMetrics.withoutPerResourceMetrics(registry); } + + @Override + protected Set preDeleteChecks(ResourceID resourceID) { + assertThat(metrics.recordedMeterIdsFor(resourceID)).isEmpty(); + assertThat(registry.getMetersAsString()).doesNotContain(resourceID.getName()); + return Collections.emptySet(); + } } diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index 7b3d5a9c03..976254f62b 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -84,6 +84,16 @@ default void reconciliationExecutionFinished( */ default void cleanupDoneFor(ResourceID resourceID, Map metadata) {} + /** + * @deprecated use {@link Metrics#successfullyFinishedReconciliation(HasMetadata, Map)} + * @param resource the {@link ResourceID} 
associated with the resource being processed + * @param metadata metadata associated with the resource being processed + */ + @Deprecated(forRemoval = true) + default void finishedReconciliation(HasMetadata resource, Map metadata) { + successfullyFinishedReconciliation(resource, metadata); + } + /** * Called when the {@link * io.javaoperatorsdk.operator.api.reconciler.Reconciler#reconcile(HasMetadata, Context)} method @@ -125,6 +135,17 @@ interface ControllerExecution { */ String controllerName(); + /** + * Retrieves the name of the successful result when the reconciliation ended positively. + * Possible values comes from the different outcomes provided by {@link + * io.javaoperatorsdk.operator.api.reconciler.UpdateControl} or {@link + * io.javaoperatorsdk.operator.api.reconciler.DeleteControl}. + * + * @param result the reconciliation result + * @return a name associated with the specified outcome + */ + String successTypeName(T result); + /** * Retrieves the {@link ResourceID} of the resource associated with the controller execution * being considered diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java index 3d6fc536a2..bc3a43a9a3 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/Controller.java @@ -70,7 +70,12 @@ public class Controller

private static final Logger log = LoggerFactory.getLogger(Controller.class); private static final String CLEANUP = "cleanup"; + private static final String DELETE = "delete"; + private static final String FINALIZER_NOT_REMOVED = "finalizerNotRemoved"; private static final String RECONCILE = "reconcile"; + private static final String RESOURCE = "resource"; + private static final String STATUS = "status"; + private static final String BOTH = "both"; public static final String CLEANER_NOT_SUPPORTED_ON_ALL_EVENT_ERROR_MESSAGE = "Cleaner is not supported when triggerReconcilerOnAllEvents enabled."; public static final String @@ -150,6 +155,18 @@ public String controllerName() { return configuration.getName(); } + @Override + public String successTypeName(UpdateControl

result) { + String successType = RESOURCE; + if (result.isPatchStatus()) { + successType = STATUS; + } + if (result.isPatchResourceAndStatus()) { + successType = BOTH; + } + return successType; + } + @Override public ResourceID resourceID() { return ResourceID.fromResource(resource); @@ -191,6 +208,11 @@ public String controllerName() { return configuration.getName(); } + @Override + public String successTypeName(DeleteControl deleteControl) { + return deleteControl.isRemoveFinalizer() ? DELETE : FINALIZER_NOT_REMOVED; + } + @Override public ResourceID resourceID() { return ResourceID.fromResource(resource); diff --git a/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java b/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java index 20dafac5be..c734e60345 100644 --- a/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java +++ b/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java @@ -26,7 +26,7 @@ import org.takes.http.FtBasic; import io.javaoperatorsdk.operator.Operator; -import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetricsV2; import io.javaoperatorsdk.operator.sample.dependent.ResourcePollerConfig; import io.javaoperatorsdk.operator.sample.dependent.SchemaDependentResource; import io.micrometer.core.instrument.logging.LoggingMeterRegistry; @@ -42,7 +42,7 @@ public static void main(String[] args) throws IOException { new Operator( overrider -> overrider.withMetrics( - MicrometerMetrics.withoutPerResourceMetrics(new LoggingMeterRegistry()))); + MicrometerMetricsV2.withoutPerResourceMetrics(new LoggingMeterRegistry()))); MySQLSchemaReconciler schemaReconciler = new MySQLSchemaReconciler(); diff --git 
a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index fb66f511d4..e2f6a4fca6 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -30,7 +30,7 @@ import io.javaoperatorsdk.operator.Operator; import io.javaoperatorsdk.operator.api.monitoring.Metrics; -import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetricsV2; import io.javaoperatorsdk.operator.sample.probes.LivenessHandler; import io.javaoperatorsdk.operator.sample.probes.StartupHandler; import io.micrometer.core.instrument.Clock; @@ -144,7 +144,7 @@ public Duration step() { // new ProcessorMetrics().bindTo(compositeRegistry); // new UptimeMetrics().bindTo(compositeRegistry); - return MicrometerMetrics.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) + return MicrometerMetricsV2.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) .collectingMetricsPerResource() .build(); } From 5a89db65150046763c126963b288c8ead6c86136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 21:08:01 +0100 Subject: [PATCH 22/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../micrometer/MicrometerMetricsV2.java | 308 ++++++------------ .../api/monitoring/AggregatedMetrics.java | 1 + .../operator/api/monitoring/Metrics.java | 50 ++- .../processing/event/EventProcessor.java | 7 +- .../operator/sample/MySQLSchemaOperator.java | 3 +- .../operator/sample/WebPageOperator.java | 5 - 6 files changed, 136 insertions(+), 238 deletions(-) diff --git 
a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java index eeb20c67db..9b75845776 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java @@ -28,7 +28,6 @@ import io.javaoperatorsdk.operator.api.reconciler.Constants; import io.javaoperatorsdk.operator.api.reconciler.RetryInfo; import io.javaoperatorsdk.operator.processing.Controller; -import io.javaoperatorsdk.operator.processing.GroupVersionKind; import io.javaoperatorsdk.operator.processing.event.Event; import io.javaoperatorsdk.operator.processing.event.ResourceID; import io.javaoperatorsdk.operator.processing.event.source.controller.ResourceEvent; @@ -38,93 +37,72 @@ public class MicrometerMetricsV2 implements Metrics { - private static final String SUCCESS_SUFFIX = "success"; - private static final String FAILURE_SUFFIX = "failure"; private static final String PREFIX = "operator.sdk."; - private static final String RECONCILIATIONS = "reconciliations."; - private static final String RECONCILIATIONS_FAILED = PREFIX + RECONCILIATIONS + FAILURE_SUFFIX; - private static final String RECONCILIATIONS_SUCCESS = PREFIX + RECONCILIATIONS + SUCCESS_SUFFIX; - private static final String RECONCILIATIONS_RETRIES_NUMBER = - PREFIX + RECONCILIATIONS + "retries.number"; - private static final String RECONCILIATIONS_STARTED = PREFIX + RECONCILIATIONS + "started"; - private static final String RECONCILIATIONS_EXECUTIONS = PREFIX + RECONCILIATIONS + "executions"; - private static final String RECONCILIATIONS_QUEUE_SIZE = PREFIX + RECONCILIATIONS + "active"; - private static final String NAME = "name"; - private static final String NAMESPACE = "namespace"; - private static final String GROUP 
= "group"; - private static final String VERSION = "version"; - private static final String KIND = "kind"; - private static final String SCOPE = "scope"; - private static final String METADATA_PREFIX = "resource."; - private static final String CONTROLLERS = "controllers."; - private static final String RECONCILIATION_EXECUTION_TIME = - PREFIX + RECONCILIATIONS + "execution" + ".duration"; - private static final String CONTROLLERS_SUCCESSFUL_EXECUTION = - PREFIX + CONTROLLERS + SUCCESS_SUFFIX; - private static final String CONTROLLERS_FAILED_EXECUTION = PREFIX + CONTROLLERS + FAILURE_SUFFIX; - private static final String CONTROLLER = "controller"; - private static final String CONTROLLER_NAME = CONTROLLER + ".name"; + private static final String CONTROLLER_NAME = "controller.name"; private static final String EVENT = "event"; private static final String ACTION = "action"; private static final String EVENTS_RECEIVED = PREFIX + "events.received"; private static final String EVENTS_DELETE = PREFIX + "events.delete"; - private static final String CLUSTER = "cluster"; - private static final String SIZE_SUFFIX = ".size"; private static final String UNKNOWN_ACTION = "UNKNOWN"; - private final boolean collectPerResourceMetrics; + public static final String TOTAL_SUFFIX = ".total"; + private static final String SUCCESS_SUFFIX = "success"; + private static final String FAILURE_SUFFIX = "failure"; + + private static final String RECONCILIATIONS = "reconciliations."; + + private static final String RECONCILIATIONS_FAILED = + PREFIX + RECONCILIATIONS + FAILURE_SUFFIX + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_SUCCESS = + PREFIX + RECONCILIATIONS + SUCCESS_SUFFIX + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_RETRIES_NUMBER = + PREFIX + RECONCILIATIONS + "retries" + TOTAL_SUFFIX; + private static final String RECONCILIATIONS_RETRIES_LAST_ATTEMPT = + PREFIX + RECONCILIATIONS + "retries.lastattempt" + TOTAL_SUFFIX; + private static final String 
RECONCILIATIONS_STARTED = + PREFIX + RECONCILIATIONS + "started" + TOTAL_SUFFIX; + + private static final String CONTROLLERS = "controllers."; + + private static final String CONTROLLERS_SUCCESSFUL_EXECUTION = + PREFIX + CONTROLLERS + SUCCESS_SUFFIX + TOTAL_SUFFIX; + private static final String CONTROLLERS_FAILED_EXECUTION = + PREFIX + CONTROLLERS + FAILURE_SUFFIX + TOTAL_SUFFIX; + + private static final String RECONCILIATIONS_EXECUTIONS_GAUGE = + PREFIX + RECONCILIATIONS + "executions"; + private static final String RECONCILIATIONS_QUEUE_SIZE_GAUGE = + PREFIX + RECONCILIATIONS + "active"; + + private static final String RECONCILIATION_EXECUTION_DURATION = + PREFIX + RECONCILIATIONS + "execution.seconds"; + private final MeterRegistry registry; private final Map gauges = new ConcurrentHashMap<>(); + private final Map executionTimers = new ConcurrentHashMap<>(); private final Consumer timerConfig; - /** - * Creates a MicrometerMetricsV2 instance configured to not collect per-resource metrics, just - * aggregates per resource **type** - * - * @param registry the {@link MeterRegistry} instance to use for metrics recording - * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics - */ - public static MicrometerMetricsV2 withoutPerResourceMetrics(MeterRegistry registry) { - return new MicrometerMetricsV2(registry, false, null); - } - - /** - * Creates a new builder to configure how the eventual MicrometerMetricsV2 instance will behave. 
- * - * @param registry the {@link MeterRegistry} instance to use for metrics recording - * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics - * @see MicrometerMetricsBuilder - */ - public static MicrometerMetricsBuilder newMicrometerMetricsBuilder(MeterRegistry registry) { - return new MicrometerMetricsBuilder(registry); - } - /** * Creates a new builder to configure how the eventual MicrometerMetricsV2 instance will behave, * pre-configuring it to collect metrics per resource. * * @param registry the {@link MeterRegistry} instance to use for metrics recording * @return a MicrometerMetricsV2 instance configured to not collect per-resource metrics - * @see PerResourceCollectingMicrometerMetricsBuilder + * @see MicrometerMetricsV2Builder */ - public static PerResourceCollectingMicrometerMetricsBuilder - newPerResourceCollectingMicrometerMetricsBuilder(MeterRegistry registry) { - return new PerResourceCollectingMicrometerMetricsBuilder(registry, null); + public static MicrometerMetricsV2Builder newPerResourceCollectingMicrometerMetricsBuilder( + MeterRegistry registry) { + return new MicrometerMetricsV2Builder(registry); } - + /** * Creates a micrometer-based Metrics implementation. * * @param registry the {@link MeterRegistry} instance to use for metrics recording - * @param collectingPerResourceMetrics whether to collect per resource metrics * @param timerConfig optional configuration for timers, defaults to publishing percentiles 0.5, * 0.95, 0.99 and histogram */ - private MicrometerMetricsV2( - MeterRegistry registry, - boolean collectingPerResourceMetrics, - Consumer timerConfig) { + private MicrometerMetricsV2(MeterRegistry registry, Consumer timerConfig) { this.registry = registry; - this.collectPerResourceMetrics = collectingPerResourceMetrics; this.timerConfig = timerConfig != null ? 
timerConfig @@ -135,41 +113,32 @@ private MicrometerMetricsV2( public void controllerRegistered(Controller controller) { final var configuration = controller.getConfiguration(); final var name = configuration.getName(); - final var executingThreadsRefName = reconciliationExecutionGaugeRefName(name); - final var resourceClass = configuration.getResourceClass(); + final var executingThreadsRefName = reconciliationExecutionGaugeRefKey(name); final var tags = new ArrayList(); - tags.add(Tag.of(CONTROLLER_NAME, name)); - addGVKTags(GroupVersionKind.gvkFor(resourceClass), tags, false); + addControllerName(name, tags); AtomicInteger executingThreads = - registry.gauge(RECONCILIATIONS_EXECUTIONS, tags, new AtomicInteger(0)); + registry.gauge(RECONCILIATIONS_EXECUTIONS_GAUGE, tags, new AtomicInteger(0)); gauges.put(executingThreadsRefName, executingThreads); - final var controllerQueueRefName = controllerQueueSizeGaugeRefName(name); + final var controllerQueueRefName = controllerQueueSizeGaugeRefKey(name); AtomicInteger controllerQueueSize = - registry.gauge(RECONCILIATIONS_QUEUE_SIZE, tags, new AtomicInteger(0)); + registry.gauge(RECONCILIATIONS_QUEUE_SIZE_GAUGE, tags, new AtomicInteger(0)); gauges.put(controllerQueueRefName, controllerQueueSize); - } - - private static @NonNull String reconciliationExecutionGaugeRefName(String controllerName) { - return RECONCILIATIONS_EXECUTIONS + "." + controllerName; - } - private static @NonNull String controllerQueueSizeGaugeRefName(String controllerName) { - return RECONCILIATIONS_QUEUE_SIZE + "." + controllerName; + final var timerBuilder = Timer.builder(RECONCILIATION_EXECUTION_DURATION).tags(tags); + timerConfig.accept(timerBuilder); + var timer = timerBuilder.register(registry); + executionTimers.put(name, timer); } // todo does it make sense to have both controller and reconciler execution counters? 
@Override public T timeControllerExecution(ControllerExecution execution) { final var name = execution.controllerName(); - final var resourceID = execution.resourceID(); - final var metadata = execution.metadata(); - final var tags = new ArrayList(16); - tags.add(Tag.of(CONTROLLER, name)); - addMetadataTags(resourceID, metadata, tags, true); - final var timerBuilder = Timer.builder(RECONCILIATION_EXECUTION_TIME).tags(tags); - timerConfig.accept(timerBuilder); - final var timer = timerBuilder.register(registry); + final var tags = new ArrayList(1); + addControllerName(name, tags); + + final var timer = executionTimers.get(name); try { final var result = timer.record( @@ -180,10 +149,10 @@ public T timeControllerExecution(ControllerExecution execution) { throw new OperatorException(e); } }); - registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER, name).increment(); + registry.counter(CONTROLLERS_SUCCESSFUL_EXECUTION, CONTROLLER_NAME, name).increment(); return result; } catch (Exception e) { - registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER, name).increment(); + registry.counter(CONTROLLERS_FAILED_EXECUTION, CONTROLLER_NAME, name).increment(); throw e; } } @@ -192,14 +161,12 @@ public T timeControllerExecution(ControllerExecution execution) { public void receivedEvent(Event event, Map metadata) { if (event instanceof ResourceEvent resourceEvent) { incrementCounter( - event.getRelatedCustomResourceID(), EVENTS_RECEIVED, metadata, Tag.of(EVENT, event.getClass().getSimpleName()), Tag.of(ACTION, resourceEvent.getAction().toString())); } else { incrementCounter( - event.getRelatedCustomResourceID(), EVENTS_RECEIVED, metadata, Tag.of(EVENT, event.getClass().getSimpleName()), @@ -209,169 +176,103 @@ public void receivedEvent(Event event, Map metadata) { @Override public void cleanupDoneFor(ResourceID resourceID, Map metadata) { - incrementCounter(resourceID, EVENTS_DELETE, metadata); + incrementCounter(EVENTS_DELETE, metadata); } @Override - public void 
reconcileCustomResource( + public void submittedForReconciliation( HasMetadata resource, RetryInfo retryInfoNullable, Map metadata) { Optional retryInfo = Optional.ofNullable(retryInfoNullable); - ResourceID resourceID = ResourceID.fromResource(resource); // Record the counter without retry tags - incrementCounter(resourceID, RECONCILIATIONS_STARTED, metadata); + incrementCounter(RECONCILIATIONS_STARTED, metadata); - // todo add metric with for resources in exhaisted retry - // Update retry number gauge int retryNumber = retryInfo.map(RetryInfo::getAttemptCount).orElse(0); - updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, retryNumber); + if (retryNumber > 0) { + incrementCounter(RECONCILIATIONS_RETRIES_NUMBER, metadata); + } + retryInfo.ifPresent( + i -> { + if (retryInfoNullable.isLastAttempt()) { + incrementCounter(RECONCILIATIONS_RETRIES_LAST_ATTEMPT, metadata); + } + }); var controllerQueueSize = - gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(controllerQueueSizeGaugeRefKey(getControllerName(metadata))); controllerQueueSize.incrementAndGet(); } @Override public void successfullyFinishedReconciliation( HasMetadata resource, Map metadata) { - ResourceID resourceID = ResourceID.fromResource(resource); - incrementCounter(resourceID, RECONCILIATIONS_SUCCESS, metadata); - - // Reset retry gauges on successful reconciliation - updateGauge(resourceID, metadata, RECONCILIATIONS_RETRIES_NUMBER, 0); + incrementCounter(RECONCILIATIONS_SUCCESS, metadata); } @Override public void reconciliationExecutionStarted(HasMetadata resource, Map metadata) { var reconcilerExecutions = - gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(reconciliationExecutionGaugeRefKey(getControllerName(metadata))); reconcilerExecutions.incrementAndGet(); } @Override public void reconciliationExecutionFinished(HasMetadata resource, Map metadata) { var reconcilerExecutions = - 
gauges.get(reconciliationExecutionGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(reconciliationExecutionGaugeRefKey(metadata.get(CONTROLLER_NAME).toString())); reconcilerExecutions.decrementAndGet(); var controllerQueueSize = - gauges.get(controllerQueueSizeGaugeRefName(metadata.get(CONTROLLER_NAME).toString())); + gauges.get(controllerQueueSizeGaugeRefKey(metadata.get(CONTROLLER_NAME).toString())); controllerQueueSize.decrementAndGet(); } @Override public void failedReconciliation( HasMetadata resource, Exception exception, Map metadata) { - incrementCounter(ResourceID.fromResource(resource), RECONCILIATIONS_FAILED, metadata); - } - - @Override - public > T monitorSizeOf(T map, String name) { - return registry.gaugeMapSize(PREFIX + name + SIZE_SUFFIX, Collections.emptyList(), map); - } - - private void addMetadataTags( - ResourceID resourceID, Map metadata, List tags, boolean prefixed) { - if (collectPerResourceMetrics) { - addTag(NAME, resourceID.getName(), tags, prefixed); - addTagOmittingOnEmptyValue(NAMESPACE, resourceID.getNamespace().orElse(null), tags, prefixed); - } - addTag(SCOPE, getScope(resourceID), tags, prefixed); - final var gvk = (GroupVersionKind) metadata.get(Constants.RESOURCE_GVK_KEY); - if (gvk != null) { - addGVKTags(gvk, tags, prefixed); - } - } - - private static void addTag(String name, String value, List tags, boolean prefixed) { - tags.add(Tag.of(getPrefixedMetadataTag(name, prefixed), value)); + incrementCounter(RECONCILIATIONS_FAILED, metadata); } - private static void addTagOmittingOnEmptyValue( - String name, String value, List tags, boolean prefixed) { - if (value != null && !value.isBlank()) { - addTag(name, value, tags, prefixed); - } - } - - private static String getPrefixedMetadataTag(String tagName, boolean prefixed) { - return prefixed ? 
METADATA_PREFIX + tagName : tagName; + private static void addTag(String name, String value, List tags) { + tags.add(Tag.of(name, value)); } - private static String getScope(ResourceID resourceID) { - return resourceID.getNamespace().isPresent() ? NAMESPACE : CLUSTER; + private static void addControllerName(Map metadata, List tags) { + addTag(CONTROLLER_NAME, getControllerName(metadata), tags); } - private static void addGVKTags(GroupVersionKind gvk, List tags, boolean prefixed) { - addTagOmittingOnEmptyValue(GROUP, gvk.getGroup(), tags, prefixed); - addTag(VERSION, gvk.getVersion(), tags, prefixed); - addTag(KIND, gvk.getKind(), tags, prefixed); + private static void addControllerName(String name, List tags) { + addTag(CONTROLLER_NAME, name, tags); } private void incrementCounter( - ResourceID id, String counterName, Map metadata, Tag... additionalTags) { - final var additionalTagsNb = - additionalTags != null && additionalTags.length > 0 ? additionalTags.length : 0; - final var metadataNb = metadata != null ? metadata.size() : 0; - final var tags = new ArrayList(6 + additionalTagsNb + metadataNb); - addMetadataTags(id, metadata, tags, false); - if (additionalTagsNb > 0) { + String counterName, Map metadata, Tag... 
additionalTags) { + + final var tags = new ArrayList(1 + additionalTags.length); + addControllerName(metadata, tags); + if (additionalTags.length > 0) { tags.addAll(List.of(additionalTags)); } - - final var counter = registry.counter(counterName, tags); - counter.increment(); + registry.counter(counterName, tags).increment(); } - private void updateGauge( - ResourceID id, Map metadata, String gaugeName, int value) { - final var tags = new ArrayList(6); - addMetadataTags(id, metadata, tags, false); - - AtomicInteger gauge = - gauges.computeIfAbsent( - gaugeName, key -> registry.gauge(gaugeName, tags, new AtomicInteger(0))); - gauge.set(value); + private static @NonNull String reconciliationExecutionGaugeRefKey(String controllerName) { + return RECONCILIATIONS_EXECUTIONS_GAUGE + "." + controllerName; } - public static class PerResourceCollectingMicrometerMetricsBuilder - extends MicrometerMetricsBuilder { - - private PerResourceCollectingMicrometerMetricsBuilder( - MeterRegistry registry, Consumer timerConfig) { - super(registry); - this.executionTimerConfig = timerConfig; - } - - /** - * Configures the Timer used for timing controller executions. By default, timers are configured - * to publish percentiles 0.5, 0.95, 0.99 and a percentile histogram. You can set: {@code - * .minimumExpectedValue(Duration.ofMillis(...)).maximumExpectedValue(Duration.ofSeconds(...)) } - * so micrometer can create the buckets for you. - * - * @param executionTimerConfig a consumer that will configure the Timer.Builder. The builder - * will already have the metric name and tags set. - * @return this builder for method chaining - */ - @Override - public PerResourceCollectingMicrometerMetricsBuilder withExecutionTimerConfig( - Consumer executionTimerConfig) { - this.executionTimerConfig = executionTimerConfig; - return this; - } + private static @NonNull String controllerQueueSizeGaugeRefKey(String controllerName) { + return RECONCILIATIONS_QUEUE_SIZE_GAUGE + "." 
+ controllerName; + } - @Override - public MicrometerMetricsV2 build() { - return new MicrometerMetricsV2(registry, true, executionTimerConfig); - } + public static String getControllerName(Map metadata) { + return (String) metadata.get(Constants.CONTROLLER_NAME); } - public static class MicrometerMetricsBuilder { + public static class MicrometerMetricsV2Builder { protected final MeterRegistry registry; - private boolean collectingPerResourceMetrics = true; protected Consumer executionTimerConfig = null; - private MicrometerMetricsBuilder(MeterRegistry registry) { + public MicrometerMetricsV2Builder(MeterRegistry registry) { this.registry = registry; } @@ -383,31 +284,14 @@ private MicrometerMetricsBuilder(MeterRegistry registry) { * will already have the metric name and tags set. * @return this builder for method chaining */ - public MicrometerMetricsBuilder withExecutionTimerConfig( + public MicrometerMetricsV2Builder withExecutionTimerConfig( Consumer executionTimerConfig) { this.executionTimerConfig = executionTimerConfig; return this; } - /** Configures the instance to collect metrics on a per-resource basis. */ - @SuppressWarnings("unused") - public PerResourceCollectingMicrometerMetricsBuilder collectingMetricsPerResource() { - collectingPerResourceMetrics = true; - return new PerResourceCollectingMicrometerMetricsBuilder(registry, executionTimerConfig); - } - - /** - * Configures the instance to only collect metrics per resource **type**, in an aggregate - * fashion, instead of per resource instance. 
- */ - @SuppressWarnings("unused") - public MicrometerMetricsBuilder notCollectingMetricsPerResource() { - collectingPerResourceMetrics = false; - return this; - } - public MicrometerMetricsV2 build() { - return new MicrometerMetricsV2(registry, collectingPerResourceMetrics, executionTimerConfig); + return new MicrometerMetricsV2(registry, executionTimerConfig); } } } diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java index 4e3540bf55..1764390d6f 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/AggregatedMetrics.java @@ -114,6 +114,7 @@ public T timeControllerExecution(ControllerExecution execution) throws Ex } @Override + @Deprecated(forRemoval = true) public > T monitorSizeOf(T map, String name) { metricsList.forEach(metrics -> metrics.monitorSizeOf(map, name)); return map; diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java index 976254f62b..12578ead24 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/monitoring/Metrics.java @@ -50,15 +50,30 @@ default void controllerRegistered(Controller controller) default void receivedEvent(Event event, Map metadata) {} /** - * Called right before a resource is dispatched to the ExecutorService for reconciliation. - * + * @deprecated use {@link Metrics#submittedForReconciliation(HasMetadata, RetryInfo, Map)} Called + * right before a resource is dispatched to the ExecutorService for reconciliation. 
* @param resource the associated with the resource * @param retryInfo the current retry state information for the reconciliation request * @param metadata metadata associated with the resource being processed */ + @Deprecated(forRemoval = true) default void reconcileCustomResource( + HasMetadata resource, RetryInfo retryInfo, Map metadata) { + submittedForReconciliation(resource, retryInfo, metadata); + } + + /** + * Called right before a resource is submitted to the ExecutorService for reconciliation. + * + * @param resource the resource being submitted for reconciliation + * @param retryInfo the current retry state information for the reconciliation request + * @param metadata metadata associated with the resource being processed + */ + default void submittedForReconciliation( HasMetadata resource, RetryInfo retryInfo, Map metadata) {} + default void reconciliationExecutionStarted(HasMetadata resource, Map metadata) {} + /** * Called when a precedent reconciliation for the resource associated with the specified {@link * ResourceID} resulted in the provided exception, resulting in a retry of the reconciliation. @@ -70,8 +85,24 @@ default void reconcileCustomResource( default void failedReconciliation( HasMetadata resource, Exception exception, Map metadata) {} - default void reconciliationExecutionStarted(HasMetadata resource, Map metadata) {} + /** + * Called when the {@link + * io.javaoperatorsdk.operator.api.reconciler.Reconciler#reconcile(HasMetadata, Context)} method + * of the Reconciler associated with the resource associated with the specified {@link ResourceID} + * has successfully finished. + * + * @param resource the {@link ResourceID} associated with the resource being processed + * @param metadata metadata associated with the resource being processed + */ + default void successfullyFinishedReconciliation( + HasMetadata resource, Map metadata) {} + /** + * Always called when a reconciliation execution finishes, whether or not it succeeded.
+ * + * @param resource the {@link ResourceID} associated with the resource being processed + * @param metadata metadata associated with the resource being processed + */ default void reconciliationExecutionFinished( HasMetadata resource, Map metadata) {} @@ -94,18 +125,6 @@ default void finishedReconciliation(HasMetadata resource, Map me successfullyFinishedReconciliation(resource, metadata); } - /** - * Called when the {@link - * io.javaoperatorsdk.operator.api.reconciler.Reconciler#reconcile(HasMetadata, Context)} method - * of the Reconciler associated with the resource associated with the specified {@link ResourceID} - * has sucessfully finished. - * - * @param resource the {@link ResourceID} associated with the resource being processed - * @param metadata metadata associated with the resource being processed - */ - default void successfullyFinishedReconciliation( - HasMetadata resource, Map metadata) {} - /** * Encapsulates the information about a controller execution i.e. a call to either {@link * io.javaoperatorsdk.operator.api.reconciler.Reconciler#reconcile(HasMetadata, Context)} or @@ -196,6 +215,7 @@ default T timeControllerExecution(ControllerExecution execution) throws E * @param the type of the Map being monitored */ @SuppressWarnings("unused") + @Deprecated(forRemoval = true) default > T monitorSizeOf(T map, String name) { return map; } diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java index 4ff482f03e..e36ea9c600 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/event/EventProcessor.java @@ -292,13 +292,12 @@ synchronized void eventProcessingFinished( return; } cleanupOnSuccessfulExecution(executionScope); - 
metrics.successfullyFinishedReconciliation(executionScope.getResource(), metricsMetadata); + metrics.finishedReconciliation(executionScope.getResource(), metricsMetadata); if ((triggerOnAllEvents() && executionScope.isDeleteEvent()) || (!triggerOnAllEvents() && state.deleteEventPresent())) { cleanupForDeletedEvent(executionScope.getResourceID()); } else if (postExecutionControl.isFinalizerRemoved()) { state.markProcessedMarkForDeletion(); - metrics.cleanupDoneFor(resourceID, metricsMetadata); } else { if (state.eventPresent() || isTriggerOnAllEventAndDeleteEventPresent(state)) { log.debug("Submitting for reconciliation."); @@ -372,20 +371,18 @@ private void handleRetryOnException(ExecutionScope

executionScope, Exception state.eventPresent() || (triggerOnAllEvents() && state.isAdditionalEventPresentAfterDeleteEvent()); state.markEventReceived(triggerOnAllEvents()); - retryAwareErrorLogging(state.getRetry(), eventPresent, exception, executionScope); + metrics.failedReconciliation(executionScope.getResource(), exception, metricsMetadata); if (eventPresent) { log.debug("New events exists for for resource id: {}", resourceID); submitReconciliationExecution(state); return; } Optional nextDelay = state.getRetry().nextDelay(); - nextDelay.ifPresentOrElse( delay -> { log.debug( "Scheduling timer event for retry with delay:{} for resource: {}", delay, resourceID); - metrics.failedReconciliation(executionScope.getResource(), exception, metricsMetadata); retryEventSource().scheduleOnce(resourceID, delay); }, () -> { diff --git a/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java b/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java index c734e60345..3e8a9df13f 100644 --- a/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java +++ b/sample-operators/mysql-schema/src/main/java/io/javaoperatorsdk/operator/sample/MySQLSchemaOperator.java @@ -42,7 +42,8 @@ public static void main(String[] args) throws IOException { new Operator( overrider -> overrider.withMetrics( - MicrometerMetricsV2.withoutPerResourceMetrics(new LoggingMeterRegistry()))); + new MicrometerMetricsV2.MicrometerMetricsV2Builder(new LoggingMeterRegistry()) + .build())); MySQLSchemaReconciler schemaReconciler = new MySQLSchemaReconciler(); diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index e2f6a4fca6..3166f84220 100644 --- 
a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -56,10 +56,6 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - // TODO // todo change: - // operator_sdk_reconciliations_queue_size_webpagestandalonedependentsreconciler - // operator_sdk_reconciliations_executions_webpagestandalonedependentsreconciler - // => controller name as label // TODO add test for checking if there are metrics in prometheus // Load configuration from config.yaml Metrics metrics = initOTLPMetrics(); @@ -145,7 +141,6 @@ public Duration step() { // new UptimeMetrics().bindTo(compositeRegistry); return MicrometerMetricsV2.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) - .collectingMetricsPerResource() .build(); } From 5c662ddc1a24ab46cede830683b682afaa5b2754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Tue, 10 Feb 2026 21:14:53 +0100 Subject: [PATCH 23/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../josdk-operator-metrics-dashboard.json | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/observability/josdk-operator-metrics-dashboard.json b/observability/josdk-operator-metrics-dashboard.json index 0ec869978e..41916bbb97 100644 --- a/observability/josdk-operator-metrics-dashboard.json +++ b/observability/josdk-operator-metrics-dashboard.json @@ -103,8 +103,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\"}[5m])) by (kind, version)", - "legendFormat": "{{kind}} ({{version}})", + "expr": "sum(rate(operator_sdk_reconciliations_started_total{service_name=\"josdk\"}[5m])) by (controller_name)", + 
"legendFormat": "{{controller_name}}", "range": true, "refId": "A" } @@ -224,8 +224,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_success_total{service_name=\"josdk\"}[5m]))", - "legendFormat": "Success", + "expr": "sum(rate(operator_sdk_reconciliations_success_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Success - {{controller_name}}", "range": true, "refId": "A" }, @@ -235,8 +235,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", - "legendFormat": "Failure", + "expr": "sum(rate(operator_sdk_reconciliations_failure_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Failure - {{controller_name}}", "range": true, "refId": "B" } @@ -495,7 +495,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m]))", + "expr": "sum(rate(operator_sdk_reconciliations_failure_total{service_name=\"josdk\"}[5m]))", "legendFormat": "Error Rate", "range": true, "refId": "A" @@ -585,8 +585,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_reconciliations_execution_duration_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", - "legendFormat": "p50 - {{controller}}", + "expr": "histogram_quantile(0.50, sum(rate(operator_sdk_reconciliations_execution_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller_name))", + "legendFormat": "p50 - {{controller_name}}", "range": true, "refId": "A" }, @@ -596,8 +596,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(operator_sdk_reconciliations_execution_duration_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", - "legendFormat": "p95 - {{controller}}", + "expr": "histogram_quantile(0.95, 
sum(rate(operator_sdk_reconciliations_execution_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller_name))", + "legendFormat": "p95 - {{controller_name}}", "range": true, "refId": "B" }, @@ -607,8 +607,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(operator_sdk_reconciliations_execution_duration_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller))", - "legendFormat": "p99 - {{controller}}", + "expr": "histogram_quantile(0.99, sum(rate(operator_sdk_reconciliations_execution_seconds_bucket{service_name=\"josdk\"}[5m])) by (le, controller_name))", + "legendFormat": "p99 - {{controller_name}}", "range": true, "refId": "C" } @@ -711,7 +711,7 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "Failures by exception type", + "description": "Failures by controller", "fieldConfig": { "defaults": { "color": { @@ -787,13 +787,13 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_reconciliations_failed_total{service_name=\"josdk\"}[5m])) by (exception)", - "legendFormat": "{{exception}}", + "expr": "sum(rate(operator_sdk_reconciliations_failure_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "{{controller_name}}", "range": true, "refId": "A" } ], - "title": "Failures by Exception Type", + "title": "Failures by Controller", "type": "timeseries" }, { @@ -877,8 +877,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_controllers_success_total{service_name=\"josdk\"}[5m])) by (controller)", - "legendFormat": "Success - {{controller}}", + "expr": "sum(rate(operator_sdk_controllers_success_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Success - {{controller_name}}", "range": true, "refId": "A" }, @@ -888,8 +888,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_controllers_failure_total{service_name=\"josdk\"}[5m])) by (controller)", - 
"legendFormat": "Failure - {{controller}}", + "expr": "sum(rate(operator_sdk_controllers_failure_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Failure - {{controller_name}}", "range": true, "refId": "B" } @@ -978,8 +978,8 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(operator_sdk_events_delete_total{service_name=\"josdk\"}[5m])) by (kind, version)", - "legendFormat": "{{kind}} ({{version}})", + "expr": "sum(rate(operator_sdk_events_delete_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "{{controller_name}}", "range": true, "refId": "A" } @@ -992,7 +992,7 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "Current retry attempt number for resources being retried", + "description": "Rate of retry attempts", "fieldConfig": { "defaults": { "color": { @@ -1076,13 +1076,13 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "operator_sdk_reconciliations_retries_number{service_name=\"josdk\"}", - "legendFormat": "{{kind}}/{{name}} ({{namespace}})", + "expr": "sum(rate(operator_sdk_reconciliations_retries_total{service_name=\"josdk\"}[5m])) by (controller_name)", + "legendFormat": "Retries - {{controller_name}}", "range": true, "refId": "A" } ], - "title": "Reconciliation Retry Attempts", + "title": "Reconciliation Retry Rate", "type": "timeseries" } ], From c0bed1426c69ad7b51f4c85ff497155149588421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 11 Feb 2026 09:14:48 +0100 Subject: [PATCH 24/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .github/workflows/e2e-test.yml | 1 + .../micrometer/MicrometerMetricsV2.java | 2 + .../kubernetes/GroupVersionKindPlural.java | 2 +- .../metrics-processing/k8s/operator.yaml | 117 ++++++++++++++ .../metrics-processing/k8s/webpage.yaml | 34 ++++ .../metrics-processing/k8s/webpage2.yaml | 34 ++++ 
sample-operators/metrics-processing/pom.xml | 130 +++++++++++++++ .../metrics/MetricsHandlingReconciler.java | 48 ++++++ .../MetricsHandlingSampleOperator.java | 150 ++++++++++++++++++ .../metrics/customresource/WebPage.java | 31 ++++ .../metrics/customresource/WebPageSpec.java | 44 +++++ .../metrics/customresource/WebPageStatus.java | 65 ++++++++ .../operator/sample/deployment.yaml | 42 +++++ .../operator/sample/ingress.yaml | 33 ++++ .../operator/sample/service.yaml | 28 ++++ .../src/main/resources/log4j2.xml | 30 ++++ .../src/main/resources/otlp-config.yaml | 23 +++ sample-operators/pom.xml | 1 + sample-operators/webpage/pom.xml | 21 --- .../operator/sample/WebPageOperator.java | 112 +------------ 20 files changed, 815 insertions(+), 133 deletions(-) create mode 100644 sample-operators/metrics-processing/k8s/operator.yaml create mode 100644 sample-operators/metrics-processing/k8s/webpage.yaml create mode 100644 sample-operators/metrics-processing/k8s/webpage2.yaml create mode 100644 sample-operators/metrics-processing/pom.xml create mode 100644 sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java create mode 100644 sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java create mode 100644 sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java create mode 100644 sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java create mode 100644 sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java create mode 100644 sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/deployment.yaml create mode 100644 
sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/ingress.yaml create mode 100644 sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/service.yaml create mode 100644 sample-operators/metrics-processing/src/main/resources/log4j2.xml create mode 100644 sample-operators/metrics-processing/src/main/resources/otlp-config.yaml diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 172f28e3f9..edfd310aed 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -25,6 +25,7 @@ jobs: - "sample-operators/tomcat-operator" - "sample-operators/webpage" - "sample-operators/leader-election" + - "sample-operators/metrics-processing" runs-on: ubuntu-latest steps: - name: Checkout diff --git a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java index 9b75845776..0a1254ad16 100644 --- a/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java +++ b/micrometer-support/src/main/java/io/javaoperatorsdk/operator/monitoring/micrometer/MicrometerMetricsV2.java @@ -191,6 +191,8 @@ public void submittedForReconciliation( if (retryNumber > 0) { incrementCounter(RECONCILIATIONS_RETRIES_NUMBER, metadata); } + + // todo having a gauge for the number of exhausted retries? 
retryInfo.ifPresent( i -> { if (retryInfoNullable.isLastAttempt()) { diff --git a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/dependent/kubernetes/GroupVersionKindPlural.java b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/dependent/kubernetes/GroupVersionKindPlural.java index 4818760888..569526f4e3 100644 --- a/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/dependent/kubernetes/GroupVersionKindPlural.java +++ b/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/processing/dependent/kubernetes/GroupVersionKindPlural.java @@ -119,7 +119,7 @@ public static GroupVersionKindPlural gvkFor(Class resourc * @return the default plural form for the specified kind */ public static String getDefaultPluralFor(String kind) { - // todo: replace by Fabric8 version when available, see + // replace by Fabric8 version when available, see // https://github.com/fabric8io/kubernetes-client/pull/6314 return kind != null ? Pluralize.toPlural(kind.toLowerCase()) : null; } diff --git a/sample-operators/metrics-processing/k8s/operator.yaml b/sample-operators/metrics-processing/k8s/operator.yaml new file mode 100644 index 0000000000..2f2484561c --- /dev/null +++ b/sample-operators/metrics-processing/k8s/operator.yaml @@ -0,0 +1,117 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: webpage-operator + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: webpage-operator +spec: + selector: + matchLabels: + app: webpage-operator + replicas: 1 + template: + metadata: + labels: + app: webpage-operator + spec: + serviceAccountName: webpage-operator + containers: + - name: operator + image: webpage-operator + imagePullPolicy: Never + ports: + - containerPort: 80 + startupProbe: + httpGet: + path: /startup + port: 8080 + initialDelaySeconds: 1 + periodSeconds: 2 + timeoutSeconds: 1 + failureThreshold: 10 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 2 + failureThreshold: 3 + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-admin +subjects: +- kind: ServiceAccount + name: webpage-operator + namespace: default +roleRef: + kind: ClusterRole + name: webpage-operator + apiGroup: "" + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: webpage-operator +rules: +- apiGroups: + - "" + resources: + - deployments + - services + - configmaps + - pods + verbs: + - '*' +- apiGroups: + - "apps" + resources: + - deployments + - services + - configmaps + verbs: + - '*' +- apiGroups: + - "apiextensions.k8s.io" + resources: + - customresourcedefinitions + verbs: + - '*' +- apiGroups: + - "sample.javaoperatorsdk" + resources: + - webpages + - webpages/status + verbs: + - '*' +- apiGroups: + - "networking.k8s.io" + resources: + - ingresses + verbs: + - '*' + diff --git a/sample-operators/metrics-processing/k8s/webpage.yaml b/sample-operators/metrics-processing/k8s/webpage.yaml new file mode 100644 index 0000000000..6a70b51282 --- /dev/null +++ b/sample-operators/metrics-processing/k8s/webpage.yaml @@ -0,0 +1,34 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +apiVersion: "sample.javaoperatorsdk/v1" +kind: WebPage +metadata: +# Use labels to match the resource with different reconciler implementations: +# labels: +# low-level: "true" + name: hellows +spec: + exposed: false + html: | + + + Hello Operator World + + + Hello World! + + diff --git a/sample-operators/metrics-processing/k8s/webpage2.yaml b/sample-operators/metrics-processing/k8s/webpage2.yaml new file mode 100644 index 0000000000..e9ae5ab19e --- /dev/null +++ b/sample-operators/metrics-processing/k8s/webpage2.yaml @@ -0,0 +1,34 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +apiVersion: "sample.javaoperatorsdk/v1" +kind: WebPage +metadata: +# Use labels to match the resource with different reconciler implementations: +# labels: +# low-level: "true" + name: hellows2 +spec: + exposed: false + html: | + + + Hello Operator World + + + Hello World! 
+ + diff --git a/sample-operators/metrics-processing/pom.xml b/sample-operators/metrics-processing/pom.xml new file mode 100644 index 0000000000..9a327370b3 --- /dev/null +++ b/sample-operators/metrics-processing/pom.xml @@ -0,0 +1,130 @@ + + + + 4.0.0 + + + io.javaoperatorsdk + sample-operators + 5.3.0-SNAPSHOT + + + sample-metrics-proricessing + jar + Operator SDK - Samples - Metrics processing + Showcases to handle metrics setup and deploys related tooling and dashboards + + + + + io.javaoperatorsdk + operator-framework-bom + ${project.version} + pom + import + + + io.micrometer + micrometer-bom + ${micrometer-core.version} + pom + import + + + + + + + io.javaoperatorsdk + operator-framework + + + io.javaoperatorsdk + micrometer-support + + + io.micrometer + micrometer-registry-otlp + ${micrometer-core.version} + + + org.yaml + snakeyaml + 2.3 + + + org.apache.logging.log4j + log4j-slf4j2-impl + + + org.apache.logging.log4j + log4j-core + compile + + + org.takes + takes + 1.25.0 + + + org.awaitility + awaitility + compile + + + io.javaoperatorsdk + operator-framework-junit + test + + + + + + com.google.cloud.tools + jib-maven-plugin + ${jib-maven-plugin.version} + + + gcr.io/distroless/java17-debian11 + + + webpage-operator + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + io.fabric8 + crd-generator-maven-plugin + ${fabric8-client.version} + + + + generate + + + + + + + + diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java new file mode 100644 index 0000000000..c80faf00a0 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java @@ -0,0 +1,48 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.sample; + +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.fabric8.kubernetes.api.model.*; +import io.javaoperatorsdk.operator.api.reconciler.*; +import io.javaoperatorsdk.operator.api.reconciler.Context; +import io.javaoperatorsdk.operator.processing.event.source.EventSource; +import io.javaoperatorsdk.operator.sample.customresource.WebPage; + +@ControllerConfiguration +public class MetricsHandlingReconciler implements Reconciler { + + public static final String INDEX_HTML = "index.html"; + + private static final Logger log = LoggerFactory.getLogger(MetricsHandlingReconciler.class); + + public MetricsHandlingReconciler() {} + + @Override + public List> prepareEventSources(EventSourceContext context) { + return List.of(); + } + + @Override + public UpdateControl reconcile(WebPage webPage, Context context) { + + return UpdateControl.noUpdate(); + } +} diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java new file mode 100644 index 0000000000..053f031cc0 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java @@ -0,0 +1,150 @@ +/* + * Copyright Java Operator SDK Authors + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.sample; + +import java.io.IOException; +import java.io.InputStream; +import java.time.Duration; +import java.util.HashMap; +import java.util.Map; + +import org.jspecify.annotations.NonNull; +import org.jspecify.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.yaml.snakeyaml.Yaml; + +import io.javaoperatorsdk.operator.Operator; +import io.javaoperatorsdk.operator.api.monitoring.Metrics; +import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetricsV2; +import io.micrometer.core.instrument.Clock; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.composite.CompositeMeterRegistry; +import io.micrometer.core.instrument.logging.LoggingMeterRegistry; +import io.micrometer.core.instrument.logging.LoggingRegistryConfig; +import io.micrometer.registry.otlp.OtlpConfig; +import io.micrometer.registry.otlp.OtlpMeterRegistry; + +public class MetricsHandlingSampleOperator { + + private static final Logger log = LoggerFactory.getLogger(MetricsHandlingSampleOperator.class); + + /** + * Based on env variables a different flavor of Reconciler is used, showcasing how the same logic + * can be implemented using the low level and higher level APIs. 
+ */ + public static void main(String[] args) throws IOException { + log.info("WebServer Operator starting!"); + + // TODO add test for checking if there are metrics in prometheus + // Load configuration from config.yaml + Metrics metrics = initOTLPMetrics(); + Operator operator = + new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); + + operator.start(); + } + + private static @NonNull Metrics initOTLPMetrics() { + CompositeMeterRegistry compositeRegistry = new CompositeMeterRegistry(); + + // Add OTLP registry + Map configProperties = loadConfigFromYaml(); + var otlpConfig = + new OtlpConfig() { + @Override + public String prefix() { + return ""; + } + + @Override + public @Nullable String get(String key) { + return configProperties.get(key); + } + + // these should come from env variables + @Override + public Map resourceAttributes() { + return Map.of("service.name", "josdk", "operator", "webpage"); + } + }; + + MeterRegistry otlpRegistry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); + compositeRegistry.add(otlpRegistry); + + // Add console logging registry if enabled (for development) + // String enableConsoleLogging = System.getenv("METRICS_CONSOLE_LOGGING"); + // todo remove + String enableConsoleLogging = "true"; + if ("true".equalsIgnoreCase(enableConsoleLogging)) { + log.info("Console metrics logging enabled"); + LoggingMeterRegistry loggingRegistry = + new LoggingMeterRegistry( + new LoggingRegistryConfig() { + @Override + public String get(String key) { + return null; + } + + @Override + public Duration step() { + return Duration.ofSeconds(10); // Log metrics every 30 seconds + } + }, + Clock.SYSTEM); + compositeRegistry.add(loggingRegistry); + } + + // Register JVM and system metrics + log.info("Registering JVM and system metrics..."); + // todo add back + // new JvmMemoryMetrics().bindTo(compositeRegistry); + // new JvmGcMetrics().bindTo(compositeRegistry); + // new JvmThreadMetrics().bindTo(compositeRegistry); + 
// new ClassLoaderMetrics().bindTo(compositeRegistry); + // new ProcessorMetrics().bindTo(compositeRegistry); + // new UptimeMetrics().bindTo(compositeRegistry); + + return MicrometerMetricsV2.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) + .build(); + } + + @SuppressWarnings("unchecked") + private static Map loadConfigFromYaml() { + Map configMap = new HashMap<>(); + try (InputStream inputStream = + MetricsHandlingSampleOperator.class.getResourceAsStream("/otlp-config.yaml")) { + if (inputStream == null) { + log.warn("otlp-config.yaml not found in resources, using default OTLP configuration"); + return configMap; + } + + Yaml yaml = new Yaml(); + Map yamlData = yaml.load(inputStream); + + // Navigate to otlp section and map properties directly + Map otlp = (Map) yamlData.get("otlp"); + if (otlp != null) { + otlp.forEach((key, value) -> configMap.put("otlp." + key, value.toString())); + } + + log.info("Loaded OTLP configuration from otlp-config.yaml: {}", configMap); + } catch (IOException e) { + log.error("Error loading otlp-config.yaml", e); + } + return configMap; + } +} diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java new file mode 100644 index 0000000000..10d7a9cf43 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java @@ -0,0 +1,31 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.sample.customresource; + +import io.fabric8.kubernetes.api.model.Namespaced; +import io.fabric8.kubernetes.client.CustomResource; +import io.fabric8.kubernetes.model.annotation.Group; +import io.fabric8.kubernetes.model.annotation.Version; + +@Group("sample.javaoperatorsdk") +@Version("v1") +public class WebPage extends CustomResource implements Namespaced { + + @Override + public String toString() { + return "WebPage{" + "spec=" + spec + ", status=" + status + '}'; + } +} diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java new file mode 100644 index 0000000000..ef70acea26 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java @@ -0,0 +1,44 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.javaoperatorsdk.operator.sample.customresource; + +public class WebPageSpec { + + private String html; + private Boolean exposed = false; + + public String getHtml() { + return html; + } + + public void setHtml(String html) { + this.html = html; + } + + public Boolean getExposed() { + return exposed; + } + + public WebPageSpec setExposed(Boolean exposed) { + this.exposed = exposed; + return this; + } + + @Override + public String toString() { + return "WebPageSpec{" + "html='" + html + '\'' + '}'; + } +} diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java new file mode 100644 index 0000000000..76de64e645 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java @@ -0,0 +1,65 @@ +/* + * Copyright Java Operator SDK Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.javaoperatorsdk.operator.sample.customresource; + +public class WebPageStatus { + + private String htmlConfigMap; + + private Boolean areWeGood; + + private String errorMessage; + + public String getHtmlConfigMap() { + return htmlConfigMap; + } + + public void setHtmlConfigMap(String htmlConfigMap) { + this.htmlConfigMap = htmlConfigMap; + } + + public Boolean getAreWeGood() { + return areWeGood; + } + + public void setAreWeGood(Boolean areWeGood) { + this.areWeGood = areWeGood; + } + + public String getErrorMessage() { + return errorMessage; + } + + public WebPageStatus setErrorMessage(String errorMessage) { + this.errorMessage = errorMessage; + return this; + } + + @Override + public String toString() { + return "WebPageStatus{" + + "htmlConfigMap='" + + htmlConfigMap + + '\'' + + ", areWeGood='" + + areWeGood + + '\'' + + ", errorMessage='" + + errorMessage + + '\'' + + '}'; + } +} diff --git a/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/deployment.yaml b/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/deployment.yaml new file mode 100644 index 0000000000..3cbbd83222 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/deployment.yaml @@ -0,0 +1,42 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 +kind: Deployment +metadata: + name: "" +spec: + selector: + matchLabels: + app: "" + replicas: 1 + template: + metadata: + labels: + app: "" + spec: + containers: + - name: nginx + image: nginx:1.17.0 + ports: + - containerPort: 80 + volumeMounts: + - name: html-volume + mountPath: /usr/share/nginx/html + volumes: + - name: html-volume + configMap: + name: "" \ No newline at end of file diff --git a/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/ingress.yaml b/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/ingress.yaml new file mode 100644 index 0000000000..b037c75dda --- /dev/null +++ b/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/ingress.yaml @@ -0,0 +1,33 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: "" + annotations: + nginx.ingress.kubernetes.io/rewrite-target: /$1 +spec: + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: "" + port: + number: 80 \ No newline at end of file diff --git a/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/service.yaml b/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/service.yaml new file mode 100644 index 0000000000..8131b24cb3 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/resources/io/javaoperatorsdk/operator/sample/service.yaml @@ -0,0 +1,28 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +apiVersion: v1 +kind: Service +metadata: + name: "" +spec: + selector: + app: "" + ports: + - protocol: TCP + port: 80 + targetPort: 80 + type: NodePort \ No newline at end of file diff --git a/sample-operators/metrics-processing/src/main/resources/log4j2.xml b/sample-operators/metrics-processing/src/main/resources/log4j2.xml new file mode 100644 index 0000000000..7cced1edbd --- /dev/null +++ b/sample-operators/metrics-processing/src/main/resources/log4j2.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/sample-operators/metrics-processing/src/main/resources/otlp-config.yaml b/sample-operators/metrics-processing/src/main/resources/otlp-config.yaml new file mode 100644 index 0000000000..17d773eb70 --- /dev/null +++ b/sample-operators/metrics-processing/src/main/resources/otlp-config.yaml @@ -0,0 +1,23 @@ +# +# Copyright Java Operator SDK Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +otlp: + # OTLP Collector endpoint - see observability/install-observability.sh for setup + url: "http://localhost:4318/v1/metrics" +# url: "http://otel-collector-collector.observability.svc.cluster.local:4318/v1/metrics" + step: 15s + batchSize: 15000 + aggregationTemporality: "cumulative" diff --git a/sample-operators/pom.xml b/sample-operators/pom.xml index 6079d3bb71..d9a9c61f4d 100644 --- a/sample-operators/pom.xml +++ b/sample-operators/pom.xml @@ -35,5 +35,6 @@ mysql-schema leader-election controller-namespace-deletion + metrics-processing diff --git a/sample-operators/webpage/pom.xml b/sample-operators/webpage/pom.xml index f8c79cf268..6ec60340ae 100644 --- a/sample-operators/webpage/pom.xml +++ b/sample-operators/webpage/pom.xml @@ -39,13 +39,6 @@ pom import - - io.micrometer - micrometer-bom - ${micrometer-core.version} - pom - import - @@ -54,20 +47,6 @@ io.javaoperatorsdk operator-framework - - io.javaoperatorsdk - micrometer-support - - - io.micrometer - micrometer-registry-otlp - ${micrometer-core.version} - - - org.yaml - snakeyaml - 2.3 - org.apache.logging.log4j log4j-slf4j2-impl diff --git a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java index 3166f84220..5366dc2e9a 100644 --- a/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java +++ b/sample-operators/webpage/src/main/java/io/javaoperatorsdk/operator/sample/WebPageOperator.java @@ -16,30 +16,14 @@ package io.javaoperatorsdk.operator.sample; import java.io.IOException; -import java.io.InputStream; import java.net.InetSocketAddress; -import java.time.Duration; -import java.util.HashMap; -import java.util.Map; -import org.jspecify.annotations.NonNull; -import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.yaml.snakeyaml.Yaml; import 
io.javaoperatorsdk.operator.Operator; -import io.javaoperatorsdk.operator.api.monitoring.Metrics; -import io.javaoperatorsdk.operator.monitoring.micrometer.MicrometerMetricsV2; import io.javaoperatorsdk.operator.sample.probes.LivenessHandler; import io.javaoperatorsdk.operator.sample.probes.StartupHandler; -import io.micrometer.core.instrument.Clock; -import io.micrometer.core.instrument.MeterRegistry; -import io.micrometer.core.instrument.composite.CompositeMeterRegistry; -import io.micrometer.core.instrument.logging.LoggingMeterRegistry; -import io.micrometer.core.instrument.logging.LoggingRegistryConfig; -import io.micrometer.registry.otlp.OtlpConfig; -import io.micrometer.registry.otlp.OtlpMeterRegistry; import com.sun.net.httpserver.HttpServer; @@ -56,12 +40,7 @@ public class WebPageOperator { public static void main(String[] args) throws IOException { log.info("WebServer Operator starting!"); - // TODO add test for checking if there are metrics in prometheus - // Load configuration from config.yaml - Metrics metrics = initOTLPMetrics(); - Operator operator = - new Operator(o -> o.withStopOnInformerErrorDuringStartup(false).withMetrics(metrics)); - + Operator operator = new Operator(o -> o.withStopOnInformerErrorDuringStartup(false)); String reconcilerEnvVar = System.getenv(WEBPAGE_RECONCILER_ENV); if (WEBPAGE_CLASSIC_RECONCILER_ENV_VALUE.equals(reconcilerEnvVar)) { operator.register(new WebPageReconciler()); @@ -79,93 +58,4 @@ public static void main(String[] args) throws IOException { server.setExecutor(null); server.start(); } - - private static @NonNull Metrics initOTLPMetrics() { - CompositeMeterRegistry compositeRegistry = new CompositeMeterRegistry(); - - // Add OTLP registry - Map configProperties = loadConfigFromYaml(); - var otlpConfig = - new OtlpConfig() { - @Override - public String prefix() { - return ""; - } - - @Override - public @Nullable String get(String key) { - return configProperties.get(key); - } - - // these should come from env 
variables - @Override - public Map resourceAttributes() { - return Map.of("service.name", "josdk", "operator", "webpage"); - } - }; - - MeterRegistry otlpRegistry = new OtlpMeterRegistry(otlpConfig, Clock.SYSTEM); - compositeRegistry.add(otlpRegistry); - - // Add console logging registry if enabled (for development) - // String enableConsoleLogging = System.getenv("METRICS_CONSOLE_LOGGING"); - // todo remove - String enableConsoleLogging = "true"; - if ("true".equalsIgnoreCase(enableConsoleLogging)) { - log.info("Console metrics logging enabled"); - LoggingMeterRegistry loggingRegistry = - new LoggingMeterRegistry( - new LoggingRegistryConfig() { - @Override - public String get(String key) { - return null; - } - - @Override - public Duration step() { - return Duration.ofSeconds(10); // Log metrics every 30 seconds - } - }, - Clock.SYSTEM); - compositeRegistry.add(loggingRegistry); - } - - // Register JVM and system metrics - log.info("Registering JVM and system metrics..."); - // todo add back - // new JvmMemoryMetrics().bindTo(compositeRegistry); - // new JvmGcMetrics().bindTo(compositeRegistry); - // new JvmThreadMetrics().bindTo(compositeRegistry); - // new ClassLoaderMetrics().bindTo(compositeRegistry); - // new ProcessorMetrics().bindTo(compositeRegistry); - // new UptimeMetrics().bindTo(compositeRegistry); - - return MicrometerMetricsV2.newPerResourceCollectingMicrometerMetricsBuilder(compositeRegistry) - .build(); - } - - @SuppressWarnings("unchecked") - private static Map loadConfigFromYaml() { - Map configMap = new HashMap<>(); - try (InputStream inputStream = WebPageOperator.class.getResourceAsStream("/otlp-config.yaml")) { - if (inputStream == null) { - log.warn("otlp-config.yaml not found in resources, using default OTLP configuration"); - return configMap; - } - - Yaml yaml = new Yaml(); - Map yamlData = yaml.load(inputStream); - - // Navigate to otlp section and map properties directly - Map otlp = (Map) yamlData.get("otlp"); - if (otlp != null) { - 
otlp.forEach((key, value) -> configMap.put("otlp." + key, value.toString())); - } - - log.info("Loaded OTLP configuration from otlp-config.yaml: {}", configMap); - } catch (IOException e) { - log.error("Error loading otlp-config.yaml", e); - } - return configMap; - } } From 19b68cb8eb5eff3d2e606f4732471dc42e7358c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20M=C3=A9sz=C3=A1ros?= Date: Wed, 11 Feb 2026 09:19:17 +0100 Subject: [PATCH 25/25] wip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Attila Mészáros --- .../operator/sample/metrics/MetricsHandlingReconciler.java | 4 ++-- .../sample/metrics/MetricsHandlingSampleOperator.java | 2 +- .../operator/sample/metrics/customresource/WebPage.java | 2 +- .../operator/sample/metrics/customresource/WebPageSpec.java | 2 +- .../operator/sample/metrics/customresource/WebPageStatus.java | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java index c80faf00a0..979567b0c0 100644 --- a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingReconciler.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.javaoperatorsdk.operator.sample; +package io.javaoperatorsdk.operator.sample.metrics; import java.util.List; @@ -24,7 +24,7 @@ import io.javaoperatorsdk.operator.api.reconciler.*; import io.javaoperatorsdk.operator.api.reconciler.Context; import io.javaoperatorsdk.operator.processing.event.source.EventSource; -import io.javaoperatorsdk.operator.sample.customresource.WebPage; +import io.javaoperatorsdk.operator.sample.metrics.customresource.WebPage; @ControllerConfiguration public class MetricsHandlingReconciler implements Reconciler { diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java index 053f031cc0..05259fe8a9 100644 --- a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.javaoperatorsdk.operator.sample; +package io.javaoperatorsdk.operator.sample.metrics; import java.io.IOException; import java.io.InputStream; diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java index 10d7a9cf43..3cbfdcc891 100644 --- a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPage.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.javaoperatorsdk.operator.sample.customresource; +package io.javaoperatorsdk.operator.sample.metrics.customresource; import io.fabric8.kubernetes.api.model.Namespaced; import io.fabric8.kubernetes.client.CustomResource; diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java index ef70acea26..5786bac357 100644 --- a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageSpec.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package io.javaoperatorsdk.operator.sample.customresource; +package io.javaoperatorsdk.operator.sample.metrics.customresource; public class WebPageSpec { diff --git a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java index 76de64e645..22e776a73d 100644 --- a/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java +++ b/sample-operators/metrics-processing/src/main/java/io/javaoperatorsdk/operator/sample/metrics/customresource/WebPageStatus.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.javaoperatorsdk.operator.sample.customresource; +package io.javaoperatorsdk.operator.sample.metrics.customresource; public class WebPageStatus {