integ: gpu-operator chart upgrade 1.6.0 -> 1.8.1

This upgrade is needed in support of A100 GPU, kernel upgrade and bug 1948050. It eliminates the requirement to create an nvidia-specific runtimeclass prior to installing the charts by pre-installing the toolkit through the toolkit-installer subchart. This commit has been tested with the following: driver: 470.57.02 toolkit: 1.7.1-ubi8 defaultRuntime: containerd Test Plan: PASS: Verify gpu-operator starts and adds nvidia.com/gpu to the node. PASS: Verify nvidia-toolkit is removed with helm override of global.toolkit_force_clean=true. PASS: Verify pods can access gpu device and nvidia tools to monitor the GPU. PASS: Verify pod can build and execute cuda sample code. PASS: Verify driver pod prints out warning when building on Low Latency kernel with helm override of: --set driver.env[0].name=IGNORE_PREEMPT_RT_PRESENCE Closes-Bug: 1948050 Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com> Change-Id: I18dd2a0ab1adc6f9364314a22373aadc93cad27f
2021-11-03 22:58:41 +00:00 · 2021-11-03 22:58:41 +00:00 · fabc6822a0
commit fabc6822a0
parent f1955eff93
5 changed files with 673 additions and 343 deletions
--- a/centos_tarball-dl.lst
+++ b/centos_tarball-dl.lst
@@ -92,5 +92,5 @@ xxHash-1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9.tar.gz#xxHash#https://api.github
 zstd-b706286adbba780006a47ef92df0ad7a785666b6.tar.gz#zstd#https://api.github.com/repos/facebook/zstd/tarball/b706286adbba780006a47ef92df0ad7a785666b6#https##
 inih-b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69.tar.gz#inih-44#https://github.com/benhoyt/inih/tarball/b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69#https##
 pf-bb-config-d7d5f1ddd17b4c80e3e0d6ce87660926f58f8585.tar.gz#pf-bb-config-21.6#https://github.com/intel/pf-bb-config/tarball/d7d5f1ddd17b4c80e3e0d6ce87660926f58f8585#https##
-gpu-operator-1.6.0.tar.gz#gpu-operator-1.6.0#https://github.com/NVIDIA/gpu-operator/archive/1.6.0.tar.gz##https##
+gpu-operator-1.8.1.tar.gz#gpu-operator-1.8.1#https://github.com/NVIDIA/gpu-operator/archive/v1.8.1.tar.gz##https##
 containernetworking-plugins-v0.9.1.tar.gz#containernetworking-plugins-v0.9.1#https://github.com/containernetworking/plugins/archive/refs/tags/v0.9.1.tar.gz#https##
--- a/gpu/gpu-operator/centos/build_srpm.data
+++ b/gpu/gpu-operator/centos/build_srpm.data
@@ -1,4 +1,4 @@
-VERSION=1.6.0
+VERSION=1.8.1
 TAR_NAME=gpu-operator
 TAR="$TAR_NAME-$VERSION.tar.gz"
 COPY_LIST=" \
--- a/gpu/gpu-operator/centos/gpu-operator.spec
+++ b/gpu/gpu-operator/centos/gpu-operator.spec
@@ -4,7 +4,7 @@

 Summary: StarlingX nvidia gpu-operator helm chart
 Name: gpu-operator
-Version: 1.6.0
+Version: 1.8.1
 Release: 0%{?_tis_dist}.%{tis_patch_ver}
 License: Apache-2.0
 Group: base
@@ -31,11 +31,15 @@ StarlingX port of NVIDIA gpu-operator
 %patch02 -p1

 %build
-cp -r assets deployments/gpu-operator/assets
-
+mkdir -p deployments/gpu-operator/assets/state-driver/
+mkdir -p deployments/gpu-operator/assets/state-operator-validation/
+cp assets/state-driver/0500_daemonset.yaml \
+         deployments/gpu-operator/assets/state-driver/0500_daemonset.yaml
+cp assets/state-operator-validation/0500_daemonset.yaml \
+         deployments/gpu-operator/assets/state-operator-validation/0500_daemonset.yaml
 helm lint deployments/gpu-operator
 mkdir build_results
-helm package --version %{helm_ver}-%{version}.%{tis_patch_ver} --app-version %{version} -d build_results deployments/gpu-operator
+helm package --version %{helm_ver}-%{version}.%{tis_patch_ver} --app-version v%{version} -d build_results deployments/gpu-operator

 %install
 install -d -m 755 ${RPM_BUILD_ROOT}%{helm_folder}
--- a/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch
+++ b/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch
@@ -1,4 +1,4 @@
-From b968c69971a195aba4e0c03e8a70df074c128f69 Mon Sep 17 00:00:00 2001
+From 1094b6f1593ec454b3a6313ecf9fae53f8c66899 Mon Sep 17 00:00:00 2001
 From: Babak Sarashki <babak.sarashki@windriver.com>
 Date: Sat, 6 Mar 2021 00:22:40 +0000
 Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts
@@ -8,17 +8,17 @@ export to the gpu-operator pod through configmap volumeMounts.

 Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
 ---
- .../gpu-operator/templates/operator.yaml      | 45 +++++++++++++++++++
+ .../gpu-operator/templates/operator.yaml      | 44 +++++++++++++++++++
 .../templates/operator_configmap.yaml         | 36 +++++++++++++++
 deployments/gpu-operator/values.yaml          |  2 +
- 3 files changed, 83 insertions(+)
+ 3 files changed, 82 insertions(+)
 create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml

 diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
-index 50983b20..1dfd9dbc 100644
+index 1d81f74..c97b4b1 100644
 --- a/deployments/gpu-operator/templates/operator.yaml
 +++ b/deployments/gpu-operator/templates/operator.yaml
-@@ -50,6 +50,45 @@ spec:
+@@ -49,6 +49,44 @@ spec:
           - name: host-os-release
             mountPath: "/host-etc/os-release"
             readOnly: true
@@ -60,11 +60,10 @@ index 50983b20..1dfd9dbc 100644
 +            subPath: {{ printf "state_monitor_%s" (base $path) }}
 +          {{- end }}
 +          {{- end }}
-+ 
-         readinessProbe:
-           exec:
-             command: ["stat", "/tmp/operator-sdk-ready"]
-@@ -63,6 +102,12 @@ spec:
+         livenessProbe:
+           httpGet:
+             path: /healthz
+@@ -72,6 +110,12 @@ spec:
         - name: host-os-release
           hostPath:
             path: "/etc/os-release"
@@ -79,7 +78,7 @@ index 50983b20..1dfd9dbc 100644
         {{- toYaml . | nindent 8 }}
 diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
 new file mode 100644
-index 00000000..61f366e8
+index 0000000..61f366e
 --- /dev/null
 +++ b/deployments/gpu-operator/templates/operator_configmap.yaml
@@ -0,0 +1,36 @@
@@ -120,18 +119,18 @@ index 00000000..61f366e8
 +{{- end }}
 +{{- end }}
 diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
-index 00d94195..8b43c59f 100644
+index 78a4757..6689636 100644
 --- a/deployments/gpu-operator/values.yaml
 +++ b/deployments/gpu-operator/values.yaml
-@@ -39,6 +39,8 @@ operator:
+@@ -70,6 +70,8 @@ operator:
                 values: [""]
   logging:
     timeEncoding: epoch
-+  # Set to "include_assets" to include assets/gpu-operator with the helm chart
++  # Set "include_assets" true to include assets/gpu-operator with the helm chart
 +  include_assets: ""
- 
- driver:
-   repository: nvcr.io/nvidia
+   resources:
+     limits:
+       cpu: 500m
 -- 
 2.17.1

--- a/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch
+++ b/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch