From 63326effb3909dbe9cf6b58e490c44801bcee0a0 Mon Sep 17 00:00:00 2001 From: Stavros Kois <47820033+stavros-k@users.noreply.github.com> Date: Thu, 20 Apr 2023 01:27:23 +0300 Subject: [PATCH] NAS-121481 / 23.10 / Set nvidia caps to void when no gpu is passed, also adds `render` group when a gpu is selected and other small fixes (#1124) * Set nvidia caps to void when no gpu is passed * add tests to init containers too * Additionally add `render` group when gpu is added * Correctly handle "0" gpu * handle fsGroup 0 properly * fix gh highlight * Correct nvidia variable and add additional check for runtime * cast both sides of the comparison * fix externalinterfaces nesting * Add dnsConfig missing docs --- .../tests/container/envFixed_test.yaml | 10 ++ .../tests/container/resources_test.yaml | 166 ++++++++++++++++++ .../externalInterface/validation_test.yaml | 42 ++--- .../tests/initContainer/data_test.yaml | 32 ++++ .../initContainer/data_upgrade_test.yaml | 32 ++++ .../tests/pod/runtime_class_name_test.yaml | 20 +++ .../tests/pod/securityContext.yaml | 28 +++ library/common/Chart.yaml | 2 +- library/common/docs/scaleExternalInterface.md | 30 ++-- library/common/docs/scaleGPU.md | 2 +- library/common/docs/workload/README.md | 86 +++++---- .../templates/helpers/_getPortRange.tpl | 7 +- .../templates/lib/container/_fixedEnv.tpl | 2 + .../templates/lib/container/_resources.tpl | 6 +- .../lib/externalInterface/_validation.tpl | 14 +- .../templates/lib/pod/_podSecurityContext.tpl | 4 +- .../templates/lib/pod/_runtimeClassName.tpl | 25 ++- 17 files changed, 408 insertions(+), 100 deletions(-) diff --git a/library/common-test/tests/container/envFixed_test.yaml b/library/common-test/tests/container/envFixed_test.yaml index fd0f7a0047..7ac00db8bb 100644 --- a/library/common-test/tests/container/envFixed_test.yaml +++ b/library/common-test/tests/container/envFixed_test.yaml @@ -45,6 +45,8 @@ tests: value: "002" - name: UMASK_SET value: "002" + - name: NVIDIA_VISIBLE_DEVICES + 
value: "void" - name: S6_READ_ONLY_ROOT value: "1" @@ -87,6 +89,8 @@ tests: value: "002" - name: UMASK_SET value: "002" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" - name: PUID value: "568" - name: USER_ID @@ -143,6 +147,8 @@ tests: value: "002" - name: UMASK_SET value: "002" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" - name: PUID value: "568" - name: USER_ID @@ -198,6 +204,8 @@ tests: value: "002" - name: UMASK_SET value: "002" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" - name: PUID value: "568" - name: USER_ID @@ -351,6 +359,8 @@ tests: value: "002" - name: UMASK_SET value: "002" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" - name: PUID value: "0" - name: USER_ID diff --git a/library/common-test/tests/container/resources_test.yaml b/library/common-test/tests/container/resources_test.yaml index 6dbb6349c9..a2924eae05 100644 --- a/library/common-test/tests/container/resources_test.yaml +++ b/library/common-test/tests/container/resources_test.yaml @@ -628,6 +628,147 @@ tests: cpu: 10m memory: 50Mi + - it: should assign GPU on the selected pod/container with multiple GPUs + set: + image: *image + global: + ixChartContext: + addNvidiaRuntimeClass: true + nvidiaRuntimeClassName: nvidia + scaleGPU: + - gpu: + nvidia.com/gpu: 1 + amd.com/gpu: 0 + targetSelector: + workload-name1: + - container-name1 + - container-name2 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: + containers: + container-name1: + enabled: true + primary: true + imageSelector: image + probes: *probes + container-name2: + enabled: true + primary: false + imageSelector: image + probes: *probes + asserts: + - documentIndex: &deploymentDoc 0 + isKind: + of: Deployment + - documentIndex: *deploymentDoc + isAPIVersion: + of: apps/v1 + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.runtimeClassName + value: nvidia + - documentIndex: *deploymentDoc + isSubset: + path: spec.template.spec.containers[0] + content: + resources: 
+ limits: + cpu: 4000m + memory: 8Gi + nvidia.com/gpu: "1" + requests: + cpu: 10m + memory: 50Mi + - documentIndex: *deploymentDoc + isSubset: + path: spec.template.spec.containers[1] + content: + resources: + limits: + cpu: 4000m + memory: 8Gi + nvidia.com/gpu: "1" + requests: + cpu: 10m + memory: 50Mi + + - it: should assign multiple GPU on the selected pod/container with multiple selected GPUs + set: + image: *image + global: + ixChartContext: + addNvidiaRuntimeClass: true + nvidiaRuntimeClassName: nvidia + scaleGPU: + - gpu: + nvidia.com/gpu: 1 + amd.com/gpu: 0 + targetSelector: + workload-name1: + - container-name1 + - gpu: + nvidia.com/gpu: 0 + amd.com/gpu: 1 + targetSelector: + workload-name1: + - container-name2 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: + containers: + container-name1: + enabled: true + primary: true + imageSelector: image + probes: *probes + container-name2: + enabled: true + primary: false + imageSelector: image + probes: *probes + asserts: + - documentIndex: &deploymentDoc 0 + isKind: + of: Deployment + - documentIndex: *deploymentDoc + isAPIVersion: + of: apps/v1 + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.runtimeClassName + value: nvidia + - documentIndex: *deploymentDoc + isSubset: + path: spec.template.spec.containers[0] + content: + resources: + limits: + cpu: 4000m + memory: 8Gi + nvidia.com/gpu: "1" + requests: + cpu: 10m + memory: 50Mi + - documentIndex: *deploymentDoc + isSubset: + path: spec.template.spec.containers[1] + content: + resources: + limits: + cpu: 4000m + memory: 8Gi + amd.com/gpu: "1" + requests: + cpu: 10m + memory: 50Mi + # Failures - it: should fail with empty requests set: @@ -868,3 +1009,28 @@ tests: asserts: - failedTemplate: errorMessage: Container - Expected non-empty + + - it: should fail with no value in gpu + set: + image: *image + scaleGPU: + - gpu: + key: + targetSelector: + workload-name1: + - container-name1 + workload: + 
workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: + containers: + container-name1: + enabled: true + primary: true + imageSelector: image + probes: *probes + asserts: + - failedTemplate: + errorMessage: Container - Expected non-empty diff --git a/library/common-test/tests/externalInterface/validation_test.yaml b/library/common-test/tests/externalInterface/validation_test.yaml index 0ebaaae149..fc5222156f 100644 --- a/library/common-test/tests/externalInterface/validation_test.yaml +++ b/library/common-test/tests/externalInterface/validation_test.yaml @@ -55,11 +55,11 @@ tests: - hostInterface: enp0s3 ipam: type: dhcp - staticIPConfigurations: - - ipAddress: 1.2.3.4 + staticIPConfigurations: + - ipAddress: 1.2.3.4 asserts: - failedTemplate: - errorMessage: External Interface - Expected empty and when is not [static] + errorMessage: External Interface - Expected empty and when is not [static] - it: should fail with non-empty staticRoutes on dhcp set: @@ -67,12 +67,12 @@ tests: - hostInterface: enp0s3 ipam: type: dhcp - staticRoutes: - - gateway: 1.2.3.4 - destination: 1.2.3.4 + staticRoutes: + - gateway: 1.2.3.4 + destination: 1.2.3.4 asserts: - failedTemplate: - errorMessage: External Interface - Expected empty and when is not [static] + errorMessage: External Interface - Expected empty and when is not [static] - it: should fail with empty staticIPConfigurations on static set: @@ -80,10 +80,10 @@ tests: - hostInterface: enp0s3 ipam: type: static - staticIPConfigurations: [] + staticIPConfigurations: [] asserts: - failedTemplate: - errorMessage: External Interface - Expected non-empty when is [static] + errorMessage: External Interface - Expected non-empty when is [static] - it: should fail with empty gateway on staticRoutes on static set: @@ -91,14 +91,14 @@ tests: - hostInterface: enp0s3 ipam: type: static - staticIPConfigurations: - - ipAddress: 1.2.3.4 - staticRoutes: - - gateway: "" - destination: 1.2.3.4 + staticIPConfigurations: + - 
ipAddress: 1.2.3.4 + staticRoutes: + - gateway: "" + destination: 1.2.3.4 asserts: - failedTemplate: - errorMessage: External Interface - Expected non-empty in + errorMessage: External Interface - Expected non-empty in - it: should fail with empty destination on staticRoutes on static set: @@ -106,14 +106,14 @@ tests: - hostInterface: enp0s3 ipam: type: static - staticIPConfigurations: - - ipAddress: 1.2.3.4 - staticRoutes: - - gateway: 1.2.3.4 - destination: "" + staticIPConfigurations: + - ipAddress: 1.2.3.4 + staticRoutes: + - gateway: 1.2.3.4 + destination: "" asserts: - failedTemplate: - errorMessage: External Interface - Expected non-empty in + errorMessage: External Interface - Expected non-empty in - it: should fail with empty ixExternalInterfaceConfigurationNames when interface is defined set: diff --git a/library/common-test/tests/initContainer/data_test.yaml b/library/common-test/tests/initContainer/data_test.yaml index d6e5556b0f..e156cbafe2 100644 --- a/library/common-test/tests/initContainer/data_test.yaml +++ b/library/common-test/tests/initContainer/data_test.yaml @@ -45,6 +45,14 @@ tests: enabled: "{{ .Values.render }}" type: init imageSelector: initImage + env: + key: value + key2: "{{ .Values.initImage.repository }}" + key3: + secretKeyRef: + expandObjectName: false + name: '{{ printf "secret-name" }}' + key: secret-key container-name3: enabled: true type: upgrade @@ -101,6 +109,30 @@ tests: path: spec.template.spec.initContainers[1] content: name: release-name-common-test-init-container-name1 + - documentIndex: *deploymentDoc + isSubset: + path: spec.template.spec.initContainers[1] + content: + env: + - name: "TZ" + value: "UTC" + - name: "UMASK" + value: "002" + - name: "UMASK_SET" + value: "002" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" + - name: "S6_READ_ONLY_ROOT" + value: "1" + - name: "key" + value: "value" + - name: "key2" + value: "bash" + - name: "key3" + valueFrom: + secretKeyRef: + key: "secret-key" + name: "secret-name" - 
documentIndex: *deploymentDoc isNull: path: spec.template.spec.initContainers[1].command diff --git a/library/common-test/tests/initContainer/data_upgrade_test.yaml b/library/common-test/tests/initContainer/data_upgrade_test.yaml index 5e99ad728e..40f7d9486f 100644 --- a/library/common-test/tests/initContainer/data_upgrade_test.yaml +++ b/library/common-test/tests/initContainer/data_upgrade_test.yaml @@ -47,6 +47,14 @@ tests: enabled: true type: init imageSelector: initImage + env: + key: value + key2: "{{ .Values.initImage.repository }}" + key3: + secretKeyRef: + expandObjectName: false + name: '{{ printf "secret-name" }}' + key: secret-key container-name3: enabled: true type: install @@ -109,6 +117,30 @@ tests: - documentIndex: *deploymentDoc isNull: path: spec.template.spec.initContainers[1].volumeMounts + - documentIndex: *deploymentDoc + isSubset: + path: spec.template.spec.initContainers[1] + content: + env: + - name: "TZ" + value: "UTC" + - name: "UMASK" + value: "002" + - name: "UMASK_SET" + value: "002" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" + - name: "S6_READ_ONLY_ROOT" + value: "1" + - name: "key" + value: "value" + - name: "key2" + value: "bash" + - name: "key3" + valueFrom: + secretKeyRef: + key: "secret-key" + name: "secret-name" - documentIndex: *deploymentDoc isNull: path: spec.template.spec.initContainers[2] diff --git a/library/common-test/tests/pod/runtime_class_name_test.yaml b/library/common-test/tests/pod/runtime_class_name_test.yaml index 0447dfc3df..7e30c4a1ec 100644 --- a/library/common-test/tests/pod/runtime_class_name_test.yaml +++ b/library/common-test/tests/pod/runtime_class_name_test.yaml @@ -192,3 +192,23 @@ tests: equal: path: spec.template.spec.runtimeClassName value: some-other-class + + - it: should not add runtimeClassName with gpu value 0 + set: + scaleGPU: + - gpu: + key: 0 + global: + ixChartContext: + addNvidiaRuntimeClass: true + nvidiaRuntimeClassName: ix-runtime + workload: + workload-name1: + enabled: true + 
primary: true + type: Deployment + podSpec: {} + asserts: + - documentIndex: &deploymentDoc 0 + isNull: + path: spec.template.spec.runtimeClassName diff --git a/library/common-test/tests/pod/securityContext.yaml b/library/common-test/tests/pod/securityContext.yaml index 48e51359a4..6da8baaaa3 100644 --- a/library/common-test/tests/pod/securityContext.yaml +++ b/library/common-test/tests/pod/securityContext.yaml @@ -220,6 +220,30 @@ tests: - name: net.ipv4.ip_unprivileged_port_start value: "443" + - it: should pass with fsGroup 0 + set: + securityContext: + pod: + fsGroup: 0 + workload: + workload-name1: + enabled: true + primary: true + type: Deployment + podSpec: {} + asserts: + - documentIndex: &deploymentDoc 0 + isKind: + of: Deployment + - documentIndex: *deploymentDoc + equal: + path: spec.template.spec.securityContext + value: + fsGroup: 0 + fsGroupChangePolicy: OnRootMismatch + supplementalGroups: [] + sysctls: [] + - it: should pass with no sysctls port_start automatically appended based on services when port is higher than 1024 set: workload: @@ -284,6 +308,7 @@ tests: supplementalGroups: - 1000 - 44 + - 107 sysctls: [] - documentIndex: &otherDeploymentDoc 1 isKind: @@ -332,6 +357,7 @@ tests: supplementalGroups: - 1000 - 44 + - 107 sysctls: [] - documentIndex: &otherDeploymentDoc 1 isKind: @@ -382,6 +408,7 @@ tests: supplementalGroups: - 1000 - 44 + - 107 sysctls: [] - documentIndex: &otherDeploymentDoc 1 isKind: @@ -394,6 +421,7 @@ tests: fsGroupChangePolicy: OnRootMismatch supplementalGroups: - 44 + - 107 sysctls: [] # Failures diff --git a/library/common/Chart.yaml b/library/common/Chart.yaml index 300eff1a02..0c2cfff5d6 100644 --- a/library/common/Chart.yaml +++ b/library/common/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: common description: A library chart for iX Official Catalog type: library -version: 1.0.5 +version: 1.0.6 appVersion: v1 annotations: title: Common Library Chart diff --git a/library/common/docs/scaleExternalInterface.md 
b/library/common/docs/scaleExternalInterface.md index 0da16497dc..3c0cf1e348 100644 --- a/library/common/docs/scaleExternalInterface.md +++ b/library/common/docs/scaleExternalInterface.md @@ -1,18 +1,18 @@ # Scale External Interface -| Key | Type | Required | Helm Template | Default | Description | -| :------------------------------------------------ | :-------: | :-----------------------------: | :-----------: | :-----: | :------------------------------------------------------------------------------ | -| scaleExternalInterface | `list` | ❌ | ❌ | `[]` | Define the external interfaces as list | -| scaleExternalInterface.targetSelectAll | `boolean` | ❌ | ❌ | `false` | Whether to add the annotation for this external interface to all workloads | -| scaleExternalInterface.targetSelector | `list` | ❌ | ❌ | `[]` | Which workloads to add the annotations | -| scaleExternalInterface.hostInterface | `string` | ✅ | ❌ | `""` | Define the hostInterface, (options in GUI populated from Middleware references) | -| scaleExternalInterface.ipam | `dict` | ✅ | ❌ | `{}` | Define the ipam | -| scaleExternalInterface.ipam.type | `string` | ✅ | ❌ | `""` | Define the ipam type (dchp, static) | -| scaleExternalInterface.staticIPConfiguration | `list` | ✅ (Only when static ipam type) | ❌ | `[]` | Define static IP Configuration (Only with static ipam type) | -| scaleExternalInterface.staticIPConfiguration.[IP] | `string` | ✅ | ❌ | `""` | Define the static IP (Only with static ipam type) | -| scaleExternalInterface.staticRoutes | `list` | ❌ | ❌ | `[]` | Define static routes (Only with static ipam type) | -| scaleExternalInterface.staticRoutes.destination | `string` | ✅ | ❌ | `""` | Define the static destination (Only with static ipam type) | -| scaleExternalInterface.staticRoutes.gateway | `string` | ✅ | ❌ | `""` | Define the static gateway (Only with static ipam type) | +| Key | Type | Required | Helm Template | Default | Description | +| :----------------------------------------------------- 
| :-------: | :-----------------------------: | :-----------: | :-----: | :------------------------------------------------------------------------------ | +| scaleExternalInterface | `list` | ❌ | ❌ | `[]` | Define the external interfaces as list | +| scaleExternalInterface.targetSelectAll | `boolean` | ❌ | ❌ | `false` | Whether to add the annotation for this external interface to all workloads | +| scaleExternalInterface.targetSelector | `list` | ❌ | ❌ | `[]` | Which workloads to add the annotations | +| scaleExternalInterface.hostInterface | `string` | ✅ | ❌ | `""` | Define the hostInterface, (options in GUI populated from Middleware references) | +| scaleExternalInterface.ipam | `dict` | ✅ | ❌ | `{}` | Define the ipam | +| scaleExternalInterface.ipam.type | `string` | ✅ | ❌ | `""` | Define the ipam type (dhcp, static) | +| scaleExternalInterface.ipam.staticIPConfigurations | `list` | ✅ (Only when static ipam type) | ❌ | `[]` | Define static IP Configuration (Only with static ipam type) | +| scaleExternalInterface.ipam.staticIPConfigurations.[IP] | `string` | ✅ | ❌ | `""` | Define the static IP (Only with static ipam type) | +| scaleExternalInterface.ipam.staticRoutes | `list` | ❌ | ❌ | `[]` | Define static routes (Only with static ipam type) | +| scaleExternalInterface.ipam.staticRoutes.destination | `string` | ✅ | ❌ | `""` | Define the static destination (Only with static ipam type) | +| scaleExternalInterface.ipam.staticRoutes.gateway | `string` | ✅ | ❌ | `""` | Define the static gateway (Only with static ipam type) | > When `targetSelectAll` is `true`, it will add the annotations to all pods (`targetSelector` is ignored in this case) > When `targetSelector` is a list, each entry is a string, with the pod name that will add the annotations. Can have multiple entries. 
@@ -39,8 +39,8 @@ scaleExternalInterface: - hostInterface: "" ipam: type: "" - staticRoutes: [] - staticIPConfigurations: [] + staticRoutes: [] + staticIPConfigurations: [] # targetSelectAll: false targetSelector: - workload-name diff --git a/library/common/docs/scaleGPU.md b/library/common/docs/scaleGPU.md index d3c77a9cb8..2496abca9c 100644 --- a/library/common/docs/scaleGPU.md +++ b/library/common/docs/scaleGPU.md @@ -10,7 +10,7 @@ > When `targetSelector` is a dict, each entry is a list, containing the name(s) of the container(s) to assign the GPU > When `targetSelector` is a empty, it will assign the GPU to the primary pod/container -> Selected pod's will get appended the group `44` in `supplementalGroups`. This is to allow rootless containers to access the GPU +> Selected pod's will get appended the group `44` and `107` in `supplementalGroups`. This is to allow rootless containers to access the GPU --- diff --git a/library/common/docs/workload/README.md b/library/common/docs/workload/README.md index 3b6956dae6..da90ce7291 100644 --- a/library/common/docs/workload/README.md +++ b/library/common/docs/workload/README.md @@ -1,44 +1,52 @@ # workload -| Key | Type | Required | Helm Template | Default | Description | -| :------------------------------------------------------------------- | :-------: | :------: | :----------------: | :-------------------------------------------------------------: | :--------------------------------------------------------------------------------- | -| workload | `dict` | ❌ | ❌ | `{}` | Define the workload as dicts | -| workload.[workload-name] | `dict` | ✅ | ❌ | `{}` | Holds workload definition | -| workload.[workload-name].enabled | `boolean` | ✅ | ❌ | `false` | Enables or Disables the workload | -| workload.[workload-name].primary | `boolean` | ✅ | ❌ | `false` | Sets the workload as primary | -| workload.[workload-name].labels | `dict` | ❌ | ✅ (On value only) | `{}` | Additional labels for workload | -| 
workload.[workload-name].annotations | `dict` | ❌ | ✅ (On value only) | `{}` | Additional annotations for workload | -| workload.[workload-name].type | `string` | ✅ | ❌ | `""` | Define the kind of the workload (Deployment, CronJob, Job) | -| workload.[workload-name].podSpec | `dict` | ✅ | ❌ | `{}` | Holds the pod definition | -| workload.[workload-name].podSpec.labels | `dict` | ❌ | ✅ (On value only) | `{}` | Additional Pod Labels | -| workload.[workload-name].podSpec.annotations | `dict` | ❌ | ✅ (On value only) | `{}` | Pod Annotations | -| workload.[workload-name].podSpec.automountServiceAccountToken | `boolean` | ❌ | ❌ | `{{ .Values.podOptions.automountServiceAccoutnToken }}` (false) | Pod's automountServiceAccountToken | -| workload.[workload-name].podSpec.hostNetwork | `boolean` | ❌ | ❌ | `{{ .Values.podOptions.hostNetwork }}` (false) | Pod's hostNetwork | -| workload.[workload-name].podSpec.enableServiceLinks | `boolean` | ❌ | ❌ | `{{ .Values.podOptions.enableServiceLinks }}` (false) | Pod's enableServiceLinks | -| workload.[workload-name].podSpec.restartPolicy | `string` | ❌ | ✅ | `{{ .Values.podOptions.restartPolicy }}` (Always) | Pod's restartPolicy. 
(Always, Never, OnFailure) | -| workload.[workload-name].podSpec.hostname | `string` | ❌ | ✅ | `""` | Pod's hostname | -| workload.[workload-name].podSpec.terminationGracePeriodSeconds | `int` | ❌ | ✅ | `{{ .Values.podOptions.terminationGracePeriodSeconds }}` (120) | Pod's terminationGracePeriodSeconds | -| workload.[workload-name].podSpec.hostAliases | `list` | ❌ | ❌ | | Pod's host aliases | -| workload.[workload-name].podSpec.hostAliases.ip | `string` | ❌ | ✅ | | Value for `ip` in hosts aliases | -| workload.[workload-name].podSpec.hostAliases.hostnames | `list` | ❌ | ❌ | | Hostnames for the `ip` in hosts aliases | -| workload.[workload-name].podSpec.hostAliases.hostnames.[host-name] | `string` | ❌ | ✅ | | [Value] for `hostnames` for the `ip` in hosts aliases | -| workload.[workload-name].podSpec.dnsPolicy | `string` | ❌ | ✅ | `{{ .Values.podOptions.dnsPolicy }}` (ClusterFirst) | Pod's DNS Policy (ClusterFirst, ClusterFirstWithHostNet, Default, None). | -| workload.[workload-name].podSpec.tolerations | `list` | ❌ | ❌ | `{{ .Values.podOptions.tolerations }}` ([]) | Pod's Tolerations | -| workload.[workload-name].podSpec.tolerations.operator | `string` | ✅ | ✅ | | Toleration's `operator` (Equal, Exists) | -| workload.[workload-name].podSpec.tolerations.key | `string` | ❌/✅ | ✅ | | Toleration's `key`. Required only when `operator` = `Equal` | -| workload.[workload-name].podSpec.tolerations.value | `string` | ❌/✅ | ✅ | | Toleration's `value`. Required only when `operator` = `Equal` | -| workload.[workload-name].podSpec.tolerations.effect | `string` | ❌ | ✅ | | Toleration's `effect`.(NoExecute, NoSchedule, PreferNoSchedule) | -| workload.[workload-name].podSpec.tolerations.tolerationSeconds | `int` | ❌ | ❌ | | Toleration's `tolerationSeconds`. 
| -| workload.[workload-name].podSpec.runtimeClassName | `string` | ❌ | ✅ | `{{ .Values.podOptions.runtimeClassName }}` ("") | Pod's runtimeClassName | -| workload.[workload-name].podSpec.securityContext | `dict` | ❌ | ❌ | `{{ .Values.securityContext.pod }}` | Pod's securityContext | -| workload.[workload-name].podSpec.securityContext.fsGroup | `int` | ❌ | ❌ | `568` | Pod's fsGroup | -| workload.[workload-name].podSpec.securityContext.fsGroupChangePolicy | `string` | ❌ | ❌ | `OnRootMismatch` | Pod's fsGroupChangePolicy (Always, OnRootMismatch) | -| workload.[workload-name].podSpec.securityContext.supplementalGroups | `list` | ❌ | ❌ | `[]` | Pod's supplementalGroups (list of `int`) | -| workload.[workload-name].podSpec.securityContext.sysctls | `list` | ❌ | ❌ | `[]` | Pod's sysctls | -| workload.[workload-name].podSpec.securityContext.sysctls.name | `string` | ✅ | ✅ | `""` | `name` of the sysctl | -| workload.[workload-name].podSpec.securityContext.sysctls.value | `string` | ✅ | ✅ | `""` | `value` of the sysctl | -| workload.[workload-name].podSpec.containers | `dict` | ❌ | ❌ | `{}` | Define container(s) | -| workload.[workload-name].podSpec.initContainers | `dict` | ❌ | ❌ | `{}` | Define initContainer(s) | +| Key | Type | Required | Helm Template | Default | Description | +| :------------------------------------------------------------------- | :-------: | :------: | :----------------: | :-------------------------------------------------------------: | :----------------------------------------------------------------------- | +| workload | `dict` | ❌ | ❌ | `{}` | Define the workload as dicts | +| workload.[workload-name] | `dict` | ✅ | ❌ | `{}` | Holds workload definition | +| workload.[workload-name].enabled | `boolean` | ✅ | ❌ | `false` | Enables or Disables the workload | +| workload.[workload-name].primary | `boolean` | ✅ | ❌ | `false` | Sets the workload as primary | +| workload.[workload-name].labels | `dict` | ❌ | ✅ (On value only) | `{}` | Additional labels 
for workload | +| workload.[workload-name].annotations | `dict` | ❌ | ✅ (On value only) | `{}` | Additional annotations for workload | +| workload.[workload-name].type | `string` | ✅ | ❌ | `""` | Define the kind of the workload (Deployment, CronJob, Job) | +| workload.[workload-name].podSpec | `dict` | ✅ | ❌ | `{}` | Holds the pod definition | +| workload.[workload-name].podSpec.labels | `dict` | ❌ | ✅ (On value only) | `{}` | Additional Pod Labels | +| workload.[workload-name].podSpec.annotations | `dict` | ❌ | ✅ (On value only) | `{}` | Pod Annotations | +| workload.[workload-name].podSpec.automountServiceAccountToken | `boolean` | ❌ | ❌ | `{{ .Values.podOptions.automountServiceAccountToken }}` (false) | Pod's automountServiceAccountToken | +| workload.[workload-name].podSpec.hostNetwork | `boolean` | ❌ | ❌ | `{{ .Values.podOptions.hostNetwork }}` (false) | Pod's hostNetwork | +| workload.[workload-name].podSpec.enableServiceLinks | `boolean` | ❌ | ❌ | `{{ .Values.podOptions.enableServiceLinks }}` (false) | Pod's enableServiceLinks | +| workload.[workload-name].podSpec.restartPolicy | `string` | ❌ | ✅ | `{{ .Values.podOptions.restartPolicy }}` (Always) | Pod's restartPolicy. 
(Always, Never, OnFailure) | +| workload.[workload-name].podSpec.hostname | `string` | ❌ | ✅ | `""` | Pod's hostname | +| workload.[workload-name].podSpec.terminationGracePeriodSeconds | `int` | ❌ | ✅ | `{{ .Values.podOptions.terminationGracePeriodSeconds }}` (120) | Pod's terminationGracePeriodSeconds | +| workload.[workload-name].podSpec.hostAliases | `list` | ❌ | ❌ | | Pod's host aliases | +| workload.[workload-name].podSpec.hostAliases.ip | `string` | ❌ | ✅ | | Value for `ip` in hosts aliases | +| workload.[workload-name].podSpec.hostAliases.hostnames | `list` | ❌ | ❌ | | Hostnames for the `ip` in hosts aliases | +| workload.[workload-name].podSpec.hostAliases.hostnames.[host-name] | `string` | ❌ | ✅ | | [Value] for `hostnames` for the `ip` in hosts aliases | +| workload.[workload-name].podSpec.dnsPolicy | `string` | ❌ | ✅ | `{{ .Values.podOptions.dnsPolicy }}` (ClusterFirst) | Pod's DNS Policy (ClusterFirst, ClusterFirstWithHostNet, Default, None). | +| workload.[workload-name].podSpec.dnsConfig | `dict` | ❌ | ❌ | `{{ .Values.podOptions.dnsConfig }}` | Pod's DNS Config | +| workload.[workload-name].podSpec.dnsConfig.nameservers | `list` | ❌ | ✅ | `[]` | Pod's DNS Config - Nameservers (Max 3) | +| workload.[workload-name].podSpec.dnsConfig.nameservers.[nameserver] | `string` | ✅ | ✅ | `""` | Pod's DNS Config - Nameserver | +| workload.[workload-name].podSpec.dnsConfig.searches | `list` | ❌ | ✅ | `[]` | Pod's DNS Config - Searches (Max 6) | +| workload.[workload-name].podSpec.dnsConfig.searches.[search] | `string` | ✅ | ✅ | `""` | Pod's DNS Config - Search | +| workload.[workload-name].podSpec.dnsConfig.options | `dict` | ❌ | ❌ | `{}` | Pod's DNS Config - Options | +| workload.[workload-name].podSpec.dnsConfig.options.name | `string` | ✅ | ✅ | `""` | Pod's DNS Config - Option name | +| workload.[workload-name].podSpec.dnsConfig.options.value | `string` | ❌ | ✅ | `""` | Pod's DNS Config - Option value | +| workload.[workload-name].podSpec.tolerations | `list` | ❌ | 
❌ | `{{ .Values.podOptions.tolerations }}` ([]) | Pod's Tolerations | +| workload.[workload-name].podSpec.tolerations.operator | `string` | ✅ | ✅ | | Toleration's `operator` (Equal, Exists) | +| workload.[workload-name].podSpec.tolerations.key | `string` | ❌/✅ | ✅ | | Toleration's `key`. Required only when `operator` = `Equal` | +| workload.[workload-name].podSpec.tolerations.value | `string` | ❌/✅ | ✅ | | Toleration's `value`. Required only when `operator` = `Equal` | +| workload.[workload-name].podSpec.tolerations.effect | `string` | ❌ | ✅ | | Toleration's `effect`.(NoExecute, NoSchedule, PreferNoSchedule) | +| workload.[workload-name].podSpec.tolerations.tolerationSeconds | `int` | ❌ | ❌ | | Toleration's `tolerationSeconds`. | +| workload.[workload-name].podSpec.runtimeClassName | `string` | ❌ | ✅ | `{{ .Values.podOptions.runtimeClassName }}` ("") | Pod's runtimeClassName | +| workload.[workload-name].podSpec.securityContext | `dict` | ❌ | ❌ | `{{ .Values.securityContext.pod }}` | Pod's securityContext | +| workload.[workload-name].podSpec.securityContext.fsGroup | `int` | ❌ | ❌ | `568` | Pod's fsGroup | +| workload.[workload-name].podSpec.securityContext.fsGroupChangePolicy | `string` | ❌ | ❌ | `OnRootMismatch` | Pod's fsGroupChangePolicy (Always, OnRootMismatch) | +| workload.[workload-name].podSpec.securityContext.supplementalGroups | `list` | ❌ | ❌ | `[]` | Pod's supplementalGroups (list of `int`) | +| workload.[workload-name].podSpec.securityContext.sysctls | `list` | ❌ | ❌ | `[]` | Pod's sysctls | +| workload.[workload-name].podSpec.securityContext.sysctls.name | `string` | ✅ | ✅ | `""` | `name` of the sysctl | +| workload.[workload-name].podSpec.securityContext.sysctls.value | `string` | ✅ | ✅ | `""` | `value` of the sysctl | +| workload.[workload-name].podSpec.containers | `dict` | ❌ | ❌ | `{}` | Define container(s) | +| workload.[workload-name].podSpec.initContainers | `dict` | ❌ | ❌ | `{}` | Define initContainer(s) | --- diff --git 
a/library/common/templates/helpers/_getPortRange.tpl b/library/common/templates/helpers/_getPortRange.tpl index e8e0fd2d8e..0d2254bbf6 100644 --- a/library/common/templates/helpers/_getPortRange.tpl +++ b/library/common/templates/helpers/_getPortRange.tpl @@ -38,15 +38,14 @@ objectData: The object data to be used to render the Pod. {{- $portToCheck := ($portValues.targetPort | default $portValues.port) -}} {{- if kindIs "string" $portToCheck -}} - {{/* Helm stores ints as floats, so convert string to float before comparing */}} - {{- $portToCheck = (tpl $portToCheck $rootCtx) | float64 -}} + {{- $portToCheck = (tpl $portToCheck $rootCtx) | int -}} {{- end -}} - {{- if or (not $portRange.low) (lt $portToCheck ($portRange.low | float64)) -}} + {{- if or (not $portRange.low) (lt ($portToCheck | int) ($portRange.low | int)) -}} {{- $_ := set $portRange "low" $portToCheck -}} {{- end -}} - {{- if or (not $portRange.high) (gt $portToCheck ($portRange.high | float64)) -}} + {{- if or (not $portRange.high) (gt ($portToCheck | int) ($portRange.high | int)) -}} {{- $_ := set $portRange "high" $portToCheck -}} {{- end -}} diff --git a/library/common/templates/lib/container/_fixedEnv.tpl b/library/common/templates/lib/container/_fixedEnv.tpl index 7e20359746..ccbfce56d5 100644 --- a/library/common/templates/lib/container/_fixedEnv.tpl +++ b/library/common/templates/lib/container/_fixedEnv.tpl @@ -49,6 +49,8 @@ objectData: The object data to be used to render the container. 
{{- $fixed = mustAppend $fixed (dict "k" "UMASK_SET" "v" $UMASK) -}} {{- if eq (include "ix.v1.common.lib.container.resources.gpu" (dict "rootCtx" $rootCtx "objectData" $objectData "returnBool" true)) "true" -}} {{- $fixed = mustAppend $fixed (dict "k" "NVIDIA_DRIVER_CAPABILITIES" "v" (join "," $nvidiaCaps)) -}} + {{- else -}} + {{- $fixed = mustAppend $fixed (dict "k" "NVIDIA_VISIBLE_DEVICES" "v" "void") -}} {{- end -}} {{/* If running as root and PUID is set (0 or greater), set related envs */}} {{- if and (or (eq (int $secContext.runAsUser) 0) (eq (int $secContext.runAsGroup) 0)) (ge (int $PUID) 0) -}} diff --git a/library/common/templates/lib/container/_resources.tpl b/library/common/templates/lib/container/_resources.tpl index 9386478be1..9cd47285e3 100644 --- a/library/common/templates/lib/container/_resources.tpl +++ b/library/common/templates/lib/container/_resources.tpl @@ -75,10 +75,12 @@ objectData: The object data to be used to render the container. {{- if not $returnBool -}} {{- range $gpu := $gpuResource -}} {{- range $k, $v := $gpu -}} - {{- if not $v -}} + {{- if or (kindIs "invalid" $v) (eq (toString $v) "") -}} {{- fail "Container - Expected non-empty " -}} - {{- end }} + {{- end -}} {{/* Dont try to schedule 0 GPUs */}} + {{- if gt (int $v) 0 }} {{ $k }}: {{ $v | quote }} + {{- end -}} {{- end -}} {{- end -}} {{- else -}} diff --git a/library/common/templates/lib/externalInterface/_validation.tpl b/library/common/templates/lib/externalInterface/_validation.tpl index d4f284f624..47b1160ea3 100644 --- a/library/common/templates/lib/externalInterface/_validation.tpl +++ b/library/common/templates/lib/externalInterface/_validation.tpl @@ -28,23 +28,23 @@ objectData: The object data to validate that contains the external interface con {{- fail (printf "External Interface - Expected to be one of [%s], but got [%s]" (join ", " $types) $objectData.ipam.type) -}} {{- end -}} - {{- if and (or $objectData.staticIPConfigurations $objectData.staticRoutes) (ne 
$objectData.ipam.type "static") -}} - {{- fail "External Interface - Expected empty and when is not [static]" -}} + {{- if and (or $objectData.ipam.staticIPConfigurations $objectData.ipam.staticRoutes) (ne $objectData.ipam.type "static") -}} + {{- fail "External Interface - Expected empty and when is not [static]" -}} {{- end -}} {{- if eq $objectData.ipam.type "static" -}} - {{- if not $objectData.staticIPConfigurations -}} - {{- fail "External Interface - Expected non-empty when is [static]" -}} + {{- if not $objectData.ipam.staticIPConfigurations -}} + {{- fail "External Interface - Expected non-empty when is [static]" -}} {{- end -}} - {{- with $objectData.staticRoutes -}} + {{- with $objectData.ipam.staticRoutes -}} {{- range . -}} {{- if not .destination -}} - {{- fail "External Interface - Expected non-empty in " -}} + {{- fail "External Interface - Expected non-empty in " -}} {{- end -}} {{- if not .gateway -}} - {{- fail "External Interface - Expected non-empty in " -}} + {{- fail "External Interface - Expected non-empty in " -}} {{- end -}} {{- end -}} {{- end -}} diff --git a/library/common/templates/lib/pod/_podSecurityContext.tpl b/library/common/templates/lib/pod/_podSecurityContext.tpl index 589fcc9b24..9204db3413 100644 --- a/library/common/templates/lib/pod/_podSecurityContext.tpl +++ b/library/common/templates/lib/pod/_podSecurityContext.tpl @@ -39,7 +39,7 @@ objectData: The object data to be used to render the Pod. {{- end -}} {{- if $gpuAdded -}} - {{- $_ := set $secContext "supplementalGroups" (concat $secContext.supplementalGroups (list 44)) -}} + {{- $_ := set $secContext "supplementalGroups" (concat $secContext.supplementalGroups (list 44 107)) -}} {{- end -}} {{- $portRange := fromJson (include "ix.v1.common.lib.helpers.securityContext.getPortRange" (dict "rootCtx" $rootCtx "objectData" $objectData)) -}} @@ -47,7 +47,7 @@ objectData: The object data to be used to render the Pod. 
{{- $_ := set $secContext "sysctls" (mustAppend $secContext.sysctls (dict "name" "net.ipv4.ip_unprivileged_port_start" "value" (printf "%v" $portRange.low))) -}} {{- end -}} - {{- if not $secContext.fsGroup -}} + {{- if or (kindIs "invalid" $secContext.fsGroup) (eq (toString $secContext.fsGroup) "") -}} {{- fail "Pod - Expected non-empty " -}} {{- end -}} diff --git a/library/common/templates/lib/pod/_runtimeClassName.tpl b/library/common/templates/lib/pod/_runtimeClassName.tpl index 2e4a00476e..4eaea16807 100644 --- a/library/common/templates/lib/pod/_runtimeClassName.tpl +++ b/library/common/templates/lib/pod/_runtimeClassName.tpl @@ -25,19 +25,28 @@ objectData: The object data to be used to render the Pod. {{- range $rootCtx.Values.scaleGPU -}} {{- if .gpu -}} {{/* Make sure it has a value... */}} + {{- $gpuAssigned := false -}} - {{- if (kindIs "map" .targetSelector) -}} - {{- range $podName, $containers := .targetSelector -}} - {{- if eq $objectData.shortName $podName -}} {{/* If the pod is selected */}} - {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} - {{- end -}} + {{- range $k, $v := .gpu -}} + {{- if $v -}} {{/* Consider assigned only if value is not "0" or "" */}} + {{- $gpuAssigned = true -}} {{- end -}} + {{- end -}} - {{- else if $objectData.primary -}} + {{- if $gpuAssigned -}} {{/* If GPU is actually assigned */}} + {{- if (kindIs "map" .targetSelector) -}} + {{- range $podName, $containers := .targetSelector -}} + {{- if eq $objectData.shortName $podName -}} {{/* If the pod is selected */}} + {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} + {{- end -}} + {{- end -}} - {{/* If the pod is primary and no targetSelector is given, assign to primary */}} - {{- $runtime = $rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} + {{- else if $objectData.primary -}} + {{/* If the pod is primary and no targetSelector is given, assign to primary */}} + {{- $runtime = 
$rootCtx.Values.global.ixChartContext.nvidiaRuntimeClassName -}} + + {{- end -}} {{- end -}} {{- end -}} {{- end -}}