Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ DEFAULT_VERSION := 99.0.0
VERSION ?= $(DEFAULT_VERSION) # the version of the operator
OPERATOR_SDK_VERSION ?= v1.35.0
ENVTEST_K8S_VERSION = 1.32 #refers to the version of kubebuilder assets to be downloaded by envtest binary # Kubernetes version from OpenShift 4.19.x
GOLANGCI_LINT_VERSION ?= v2.6.1
GOLANGCI_LINT_VERSION ?= v2.9.0
KUSTOMIZE_VERSION ?= v5.2.1
CONTROLLER_TOOLS_VERSION ?= v0.16.5
OPM_VERSION ?= v1.23.0
Expand Down Expand Up @@ -47,7 +47,7 @@ IMAGE_TAG_BASE ?= openshift.io/oadp-operator
BUNDLE_IMG ?= $(IMAGE_TAG_BASE)-bundle:v$(VERSION)

# BUNDLE_GEN_FLAGS are the flags passed to the operator-sdk generate bundle command
BUNDLE_GEN_FLAGS ?= -q --extra-service-accounts "velero,non-admin-controller,oadp-vm-file-restore-controller-manager" --overwrite --version $(VERSION) $(BUNDLE_METADATA_OPTS)
BUNDLE_GEN_FLAGS ?= -q --extra-service-accounts "velero,non-admin-controller,oadp-vm-file-restore-controller-manager,oadp-kubevirt-datamover-controller-manager" --overwrite --version $(VERSION) $(BUNDLE_METADATA_OPTS)

# USE_IMAGE_DIGESTS defines if images are resolved via tags or digests
# You can enable this value if you would like to use SHA Based Digests
Expand Down Expand Up @@ -472,7 +472,7 @@ TMP_DIR=$$(mktemp -d) ;\
cd $$TMP_DIR ;\
go mod init tmp ;\
echo "Downloading $(2) to branch directory" ;\
GOBIN=$(dir $(1)) go install -mod=mod $(2) ;\
GOBIN=$(dir $(1)) go install -a -mod=mod $(2) ;\
rm -rf $$TMP_DIR ;\
}
endef
Expand Down
9 changes: 5 additions & 4 deletions api/v1alpha1/dataprotectionapplication_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@ const ReconcileCompleteMessage = "Reconcile complete"

// Readiness Conditions
const (
ConditionVeleroReady = "VeleroReady"
ConditionNodeAgentReady = "NodeAgentReady"
ConditionNonAdminReady = "NonAdminReady"
ConditionVMFileRestoreReady = "VMFileRestoreReady"
ConditionVeleroReady = "VeleroReady"
ConditionNodeAgentReady = "NodeAgentReady"
ConditionNonAdminReady = "NonAdminReady"
ConditionVMFileRestoreReady = "VMFileRestoreReady"
ConditionKubevirtDatamoverReady = "KubevirtDatamoverReady"
)

// Readiness condition reasons
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
creationTimestamp: null
name: oadp-kubevirt-datamover-metrics-reader
rules:
- nonResourceURLs:
- /metrics
verbs:
- get
109 changes: 109 additions & 0 deletions bundle/manifests/oadp-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,78 @@ spec:
verbs:
- get
serviceAccountName: non-admin-controller
- rules:
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- backup.kubevirt.io
resources:
- virtualmachinebackups
- virtualmachinebackuptrackers
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- backup.kubevirt.io
resources:
- virtualmachinebackups/status
- virtualmachinebackuptrackers/status
verbs:
- get
- apiGroups:
- kubevirt.io
resources:
- virtualmachines
verbs:
- get
- list
- watch
- apiGroups:
- velero.io
resources:
- datauploads
verbs:
- get
- list
- patch
- update
- watch
- apiGroups:
- velero.io
resources:
- datauploads/status
verbs:
- get
- patch
- update
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
serviceAccountName: oadp-kubevirt-datamover-controller-manager
- rules:
- apiGroups:
- ""
Expand Down Expand Up @@ -1342,6 +1414,8 @@ spec:
value: quay.io/konveyor/oadp-vmfr-access-filebrowser:latest
- name: RELATED_IMAGE_CONSOLE_CLI_DOWNLOAD
value: quay.io/konveyor/oadp-cli-binaries:latest
- name: RELATED_IMAGE_KUBEVIRT_DATAMOVER_CONTROLLER
value: quay.io/konveyor/kubevirt-datamover-controller:latest
image: quay.io/konveyor/oadp-operator:latest
imagePullPolicy: Always
livenessProbe:
Expand Down Expand Up @@ -1398,6 +1472,39 @@ spec:
- emptyDir: {}
name: tmp-dir
permissions:
- rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- get
- list
- watch
- create
- update
- patch
- delete
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
serviceAccountName: oadp-kubevirt-datamover-controller-manager
- rules:
- apiGroups:
- ""
Expand Down Expand Up @@ -1547,4 +1654,6 @@ spec:
name: vm-file-restore-browser
- image: quay.io/konveyor/oadp-cli-binaries:latest
name: console-cli-download
- image: quay.io/konveyor/kubevirt-datamover-controller:latest
name: kubevirt-datamover-controller
version: 99.0.0
2 changes: 2 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ spec:
value: quay.io/konveyor/oadp-vmfr-access-filebrowser:latest
- name: RELATED_IMAGE_CONSOLE_CLI_DOWNLOAD
value: quay.io/konveyor/oadp-cli-binaries:latest
- name: RELATED_IMAGE_KUBEVIRT_DATAMOVER_CONTROLLER
value: quay.io/konveyor/kubevirt-datamover-controller:latest
args:
- --leader-elect
image: controller:latest
Expand Down
1 change: 1 addition & 0 deletions config/manifests/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ resources:
- ../velero
- ../non-admin-controller_rbac
- ../vm-file-restore-controller_rbac
- ../kubevirt-datamover-controller_rbac

# [WEBHOOK] To enable webhooks, uncomment all the sections with [WEBHOOK] prefix.
# Do NOT uncomment sections with prefix [CERTMANAGER], as OLM does not support cert-manager.
Expand Down
142 changes: 142 additions & 0 deletions docs/design/tech-debt/controller-state-management.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Controller State Management - Known Technical Debt

## Overview

Several controllers in the OADP operator use package-level variables to track state across reconciliation loops. While this pattern is established and functional in practice, it technically represents a data race when multiple reconciliations run concurrently.

## Affected Controllers

The following controllers use mutable package-level state without synchronization:

### 1. KubeVirt DataMover Controller
**File**: `internal/controller/kubevirt_datamover_controller.go:46-47`

```go
var (
kdmDpaResourceVersion = ""
previousKubevirtDatamoverEnabled = false
)
```

Used in `ensureKubevirtDatamoverRequiredSpecs()` at lines 181-184 to track DPA resource version and plugin enablement state.

### 2. VM File Restore Controller
**File**: `internal/controller/vmfilerestore_controller.go:47-48`

```go
var (
vmfrDpaResourceVersion = ""
previousVMFileRestoreConfiguration *oadpv1alpha1.VMFileRestore = nil
)
```

Used to track DPA resource version and VM file restore configuration changes.

### 3. Non-Admin Controller
**File**: `internal/controller/nonadmin_controller.go:46-48`

```go
var (
dpaResourceVersion = ""
previousNonAdminConfiguration *oadpv1alpha1.NonAdmin = nil
previousDefaultBSLSyncPeriod *time.Duration = nil
)
```

Used to track DPA resource version, non-admin configuration, and BSL sync period.

## Technical Race Condition

### The Issue

Controller-runtime can execute concurrent reconciliations of the same DPA resource when:
- The DPA is updated rapidly (multiple events queued)
- Periodic resyncs overlap with event-driven reconciliations
- Status updates trigger new reconciles while one is in-flight
- Manual reconciliation is triggered

Multiple goroutines can read/write these package-level variables simultaneously without synchronization, creating a data race.

### Example Race Scenario

```go
// Goroutine 1: Reconcile DPA v1
if len(kdmDpaResourceVersion) == 0 || ... { // READ
kdmDpaResourceVersion = "v1" // WRITE
}

// Goroutine 2: Reconcile DPA v2 (concurrent)
if len(kdmDpaResourceVersion) == 0 || ... { // READ (interleaved)
kdmDpaResourceVersion = "v2" // WRITE (interleaved)
}
```

## Why This Works in Practice

1. **DPA Singleton Validation**: The validator at `internal/controller/validator.go:33-34` ensures only one DPA CR exists per namespace:
```go
if len(dpaList.Items) > 1 {
return false, errors.New("only one DPA CR can exist per OADP installation namespace")
}
```

2. **Low Contention**: With a single DPA, reconciliation events are relatively infrequent.

3. **Simple State**: The cached state is just resource versions and configuration snapshots, not complex data structures.

4. **Controller-Runtime Behavior**: In practice, controller-runtime may serialize reconciles more than the API guarantees.

## Potential Fixes (Future Work)

If this race becomes problematic, any fix must be applied **consistently across all three controllers**. Options include:

### Option 1: Add Mutex Synchronization
```go
type DataProtectionApplicationReconciler struct {
// ... existing fields ...
kdmMutex sync.RWMutex
kdmDpaResourceVersion string
previousKubevirtDatamoverEnabled bool
}

// In reconcile logic:
r.kdmMutex.Lock()
defer r.kdmMutex.Unlock()
if len(r.kdmDpaResourceVersion) == 0 || ... {
r.kdmDpaResourceVersion = dpa.GetResourceVersion()
}
```

**Note**: No controllers in the codebase currently use mutexes on reconciler structs. Mutexes are only used locally within functions for explicit goroutine parallelism (e.g., `dataprotectiontest_controller.go:522`).

### Option 2: Store in DPA Annotations (Recommended)
Remove in-memory state entirely and read from Kubernetes API each time:
```go
podAnnotations := map[string]string{
kdmDpaResourceVersionAnnotation: dpa.GetResourceVersion(), // Always use current
}
```

This is more idiomatic for Kubernetes operators and eliminates the race.

### Option 3: Per-Namespace State Map
```go
type DataProtectionApplicationReconciler struct {
kdmStateMutex sync.RWMutex
kdmState map[string]*kdmState // namespace -> state
}
```

## Decision

**Status**: Documented, no immediate action required

**Rationale**: This is an established pattern across multiple controllers. The singleton DPA constraint and low reconciliation frequency make the race low-risk in practice. Any fix would require coordinated changes across three controllers.

**Future Action**: If Go's race detector flags this in CI, or if concurrent reconciliation issues arise, implement Option 2 (DPA annotations) consistently across all affected controllers.

## References

- Singleton validation: `internal/controller/validator.go:27-35`
- Controller setup: `cmd/main.go:272-277`
- Controller-runtime concurrency: https://github.com/kubernetes-sigs/controller-runtime/blob/main/FAQ.md#q-how-do-i-have-different-logic-in-my-reconciler-for-different-types-of-events-eg-create-update-delete
Loading