-
Notifications
You must be signed in to change notification settings - Fork 3.3k
improvement(helm): update GPU device plugin and add cert-manager issuers #3036
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3c6f4e8
549edba
43a8785
be8abee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| {{- if .Values.certManager.enabled }} | ||
| {{- /* | ||
| cert-manager Issuer Bootstrap Pattern | ||
|
|
||
| PREREQUISITE: cert-manager must be installed in your cluster before enabling this. | ||
| The root CA Certificate is created in the namespace specified by certManager.rootCA.namespace | ||
| (defaults to "cert-manager"). Ensure this namespace exists and cert-manager is running there. | ||
|
|
||
| Install cert-manager: https://cert-manager.io/docs/installation/ | ||
|
|
||
| This implements the recommended pattern from cert-manager documentation: | ||
| 1. A self-signed ClusterIssuer (for bootstrapping the root CA only) | ||
| 2. A root CA Certificate (self-signed, used to sign other certificates) | ||
| 3. A CA ClusterIssuer (uses the root CA to sign certificates) | ||
|
|
||
| Reference: https://cert-manager.io/docs/configuration/selfsigned/ | ||
| */ -}} | ||
|
|
||
| --- | ||
| # 1. Self-Signed ClusterIssuer (Bootstrap Only) | ||
| # This issuer is used ONLY to create the root CA certificate. | ||
| # It should NOT be used directly for application certificates. | ||
| apiVersion: cert-manager.io/v1 | ||
| kind: ClusterIssuer | ||
| metadata: | ||
| name: {{ .Values.certManager.selfSignedIssuer.name }} | ||
| labels: | ||
| {{- include "sim.labels" . | nindent 4 }} | ||
| app.kubernetes.io/component: cert-manager | ||
| spec: | ||
| selfSigned: {} | ||
|
|
||
| --- | ||
| # 2. Root CA Certificate | ||
| # This certificate is signed by the self-signed issuer and becomes the root of trust. | ||
| # The secret created here will be used by the CA issuer to sign certificates. | ||
| # NOTE: This must be created in the cert-manager namespace (or the namespace specified | ||
| # in certManager.rootCA.namespace). Ensure cert-manager is installed there first. | ||
| apiVersion: cert-manager.io/v1 | ||
| kind: Certificate | ||
| metadata: | ||
| name: {{ .Values.certManager.rootCA.certificateName }} | ||
| namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" }} # Must match cert-manager's cluster-resource-namespace | ||
| labels: | ||
| {{- include "sim.labels" . | nindent 4 }} | ||
| app.kubernetes.io/component: cert-manager | ||
| spec: | ||
| isCA: true | ||
| commonName: {{ .Values.certManager.rootCA.commonName }} | ||
| secretName: {{ .Values.certManager.rootCA.secretName }} | ||
| duration: {{ .Values.certManager.rootCA.duration | default "87600h" }} | ||
| renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" }} | ||
| privateKey: | ||
| algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" }} | ||
| size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 }} | ||
| subject: | ||
| organizations: | ||
| {{- if .Values.certManager.rootCA.subject.organizations }} | ||
| {{- toYaml .Values.certManager.rootCA.subject.organizations | nindent 6 }} | ||
| {{- else }} | ||
| - {{ .Release.Name }} | ||
| {{- end }} | ||
| issuerRef: | ||
| name: {{ .Values.certManager.selfSignedIssuer.name }} | ||
| kind: ClusterIssuer | ||
| group: cert-manager.io | ||
|
|
||
| --- | ||
| # 3. CA ClusterIssuer | ||
| # This is the issuer that should be used by applications to obtain certificates. | ||
| # It signs certificates using the root CA created above. | ||
| # NOTE: This issuer may briefly show "not ready" on first install while cert-manager | ||
| # processes the Certificate above and creates the secret. It will auto-reconcile. | ||
| apiVersion: cert-manager.io/v1 | ||
| kind: ClusterIssuer | ||
| metadata: | ||
| name: {{ .Values.certManager.caIssuer.name }} | ||
| labels: | ||
| {{- include "sim.labels" . | nindent 4 }} | ||
| app.kubernetes.io/component: cert-manager | ||
| spec: | ||
| ca: | ||
| secretName: {{ .Values.certManager.rootCA.secretName }} | ||
| {{- end }} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,36 @@ | ||
| {{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }} | ||
| --- | ||
| # NVIDIA Device Plugin DaemonSet for GPU support | ||
| # 1. ConfigMap for NVIDIA Device Plugin Configuration | ||
| apiVersion: v1 | ||
| kind: ConfigMap | ||
| metadata: | ||
| name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config | ||
| namespace: {{ .Release.Namespace }} | ||
| labels: | ||
| {{- include "sim.labels" . | nindent 4 }} | ||
| app.kubernetes.io/component: nvidia-device-plugin | ||
| data: | ||
| config.yaml: | | ||
| version: v1 | ||
| flags: | ||
| {{- if eq .Values.ollama.gpu.strategy "mig" }} | ||
| migStrategy: "single" | ||
| {{- else }} | ||
| migStrategy: "none" | ||
| {{- end }} | ||
| failOnInitError: false | ||
| plugin: | ||
| passDeviceSpecs: true | ||
| deviceListStrategy: envvar | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Invalid config structure for NVIDIA device plugin settings (Medium Severity): The ConfigMap places |
||
| {{- if eq .Values.ollama.gpu.strategy "time-slicing" }} | ||
| sharing: | ||
| timeSlicing: | ||
| resources: | ||
| - name: nvidia.com/gpu | ||
| replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }} | ||
| {{- end }} | ||
| --- | ||
| # 2. NVIDIA Device Plugin DaemonSet for GPU support | ||
| apiVersion: apps/v1 | ||
| kind: DaemonSet | ||
| metadata: | ||
|
|
@@ -35,9 +65,6 @@ spec: | |
| # Only schedule on nodes with NVIDIA GPUs | ||
| accelerator: nvidia | ||
| priorityClassName: system-node-critical | ||
| runtimeClassName: nvidia | ||
| hostNetwork: true | ||
| hostPID: true | ||
| volumes: | ||
| - name: device-plugin | ||
| hostPath: | ||
|
|
@@ -48,22 +75,21 @@ spec: | |
| - name: sys | ||
| hostPath: | ||
| path: /sys | ||
| - name: proc-driver-nvidia | ||
| hostPath: | ||
| path: /proc/driver/nvidia | ||
| # Volume to mount the ConfigMap | ||
| - name: nvidia-device-plugin-config | ||
| configMap: | ||
| name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config | ||
| containers: | ||
| - name: nvidia-device-plugin | ||
| image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5 | ||
| image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 | ||
| imagePullPolicy: Always | ||
| args: | ||
| - --mig-strategy=single | ||
| - --pass-device-specs=true | ||
| - --fail-on-init-error=false | ||
| - --device-list-strategy=envvar | ||
| - --nvidia-driver-root=/host-sys/fs/cgroup | ||
| - "--config-file=/etc/device-plugin/config.yaml" | ||
| {{- if eq .Values.ollama.gpu.strategy "mig" }} | ||
| env: | ||
| - name: NVIDIA_MIG_MONITOR_DEVICES | ||
| value: all | ||
| {{- end }} | ||
| securityContext: | ||
| allowPrivilegeEscalation: false | ||
| capabilities: | ||
|
|
@@ -74,29 +100,16 @@ spec: | |
| - name: dev | ||
| mountPath: /dev | ||
| - name: sys | ||
| mountPath: /host-sys | ||
| mountPath: /sys | ||
| readOnly: true | ||
| - name: proc-driver-nvidia | ||
| mountPath: /proc/driver/nvidia | ||
| - name: nvidia-device-plugin-config | ||
| mountPath: /etc/device-plugin/ | ||
| readOnly: true | ||
| resources: | ||
| requests: | ||
| cpu: 50m | ||
| memory: 10Mi | ||
| memory: 20Mi | ||
| limits: | ||
| cpu: 50m | ||
| memory: 20Mi | ||
| {{- if .Values.nodeSelector }} | ||
| nodeSelector: | ||
| {{- toYaml .Values.nodeSelector | nindent 8 }} | ||
| {{- end }} | ||
| --- | ||
| # RuntimeClass for NVIDIA Container Runtime | ||
| apiVersion: node.k8s.io/v1 | ||
waleedlatif1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| kind: RuntimeClass | ||
| metadata: | ||
| name: {{ include "sim.fullname" . }}-nvidia | ||
| labels: | ||
| {{- include "sim.labels" . | nindent 4 }} | ||
| handler: nvidia | ||
| {{- end }} | ||
| memory: 50Mi | ||
| {{- end }} | ||
Uh oh!
There was an error while loading. Please reload this page.