integ: gpu-operator chart upgrade 1.6.0 -> 1.8.1

This upgrade is needed to support the A100 GPU, the kernel
upgrade, and bug 1948050. It eliminates the requirement to
create an NVIDIA-specific RuntimeClass prior to installing
the charts by pre-installing the toolkit through the
toolkit-installer subchart.

This commit has been tested with the following:

driver: 470.57.02
toolkit: 1.7.1-ubi8
defaultRuntime: containerd

Test Plan:
PASS: Verify gpu-operator starts and adds nvidia.com/gpu
      to the node.
PASS: Verify nvidia-toolkit is removed with helm override
      of global.toolkit_force_clean=true.
PASS: Verify pods can access gpu device and nvidia tools
      to monitor the GPU.
PASS: Verify pod can build and execute cuda sample code.
PASS: Verify driver pod prints out a warning when building
      on a Low Latency kernel with helm override of:
      --set driver.env[0].name=IGNORE_PREEMPT_RT_PRESENCE

Closes-Bug: 1948050
Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
Change-Id: I18dd2a0ab1adc6f9364314a22373aadc93cad27f
This commit is contained in:
Babak Sarashki 2021-11-03 22:58:41 +00:00
parent f1955eff93
commit fabc6822a0
5 changed files with 673 additions and 343 deletions

View File

@ -92,5 +92,5 @@ xxHash-1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9.tar.gz#xxHash#https://api.github
zstd-b706286adbba780006a47ef92df0ad7a785666b6.tar.gz#zstd#https://api.github.com/repos/facebook/zstd/tarball/b706286adbba780006a47ef92df0ad7a785666b6#https##
inih-b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69.tar.gz#inih-44#https://github.com/benhoyt/inih/tarball/b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69#https##
pf-bb-config-d7d5f1ddd17b4c80e3e0d6ce87660926f58f8585.tar.gz#pf-bb-config-21.6#https://github.com/intel/pf-bb-config/tarball/d7d5f1ddd17b4c80e3e0d6ce87660926f58f8585#https##
gpu-operator-1.6.0.tar.gz#gpu-operator-1.6.0#https://github.com/NVIDIA/gpu-operator/archive/1.6.0.tar.gz##https##
gpu-operator-1.8.1.tar.gz#gpu-operator-1.8.1#https://github.com/NVIDIA/gpu-operator/archive/v1.8.1.tar.gz##https##
containernetworking-plugins-v0.9.1.tar.gz#containernetworking-plugins-v0.9.1#https://github.com/containernetworking/plugins/archive/refs/tags/v0.9.1.tar.gz#https##

View File

@ -1,4 +1,4 @@
VERSION=1.6.0
VERSION=1.8.1
TAR_NAME=gpu-operator
TAR="$TAR_NAME-$VERSION.tar.gz"
COPY_LIST=" \

View File

@ -4,7 +4,7 @@
Summary: StarlingX nvidia gpu-operator helm chart
Name: gpu-operator
Version: 1.6.0
Version: 1.8.1
Release: 0%{?_tis_dist}.%{tis_patch_ver}
License: Apache-2.0
Group: base
@ -31,11 +31,15 @@ StarlingX port of NVIDIA gpu-operator
%patch02 -p1
%build
cp -r assets deployments/gpu-operator/assets
mkdir -p deployments/gpu-operator/assets/state-driver/
mkdir -p deployments/gpu-operator/assets/state-operator-validation/
cp assets/state-driver/0500_daemonset.yaml \
deployments/gpu-operator/assets/state-driver/0500_daemonset.yaml
cp assets/state-operator-validation/0500_daemonset.yaml \
deployments/gpu-operator/assets/state-operator-validation/0500_daemonset.yaml
helm lint deployments/gpu-operator
mkdir build_results
helm package --version %{helm_ver}-%{version}.%{tis_patch_ver} --app-version %{version} -d build_results deployments/gpu-operator
helm package --version %{helm_ver}-%{version}.%{tis_patch_ver} --app-version v%{version} -d build_results deployments/gpu-operator
%install
install -d -m 755 ${RPM_BUILD_ROOT}%{helm_folder}

View File

@ -1,4 +1,4 @@
From b968c69971a195aba4e0c03e8a70df074c128f69 Mon Sep 17 00:00:00 2001
From 1094b6f1593ec454b3a6313ecf9fae53f8c66899 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <babak.sarashki@windriver.com>
Date: Sat, 6 Mar 2021 00:22:40 +0000
Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts
@ -8,17 +8,17 @@ export to the gpu-operator pod through configmap volumeMounts.
Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
---
.../gpu-operator/templates/operator.yaml | 45 +++++++++++++++++++
.../gpu-operator/templates/operator.yaml | 44 +++++++++++++++++++
.../templates/operator_configmap.yaml | 36 +++++++++++++++
deployments/gpu-operator/values.yaml | 2 +
3 files changed, 83 insertions(+)
3 files changed, 82 insertions(+)
create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml
diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
index 50983b20..1dfd9dbc 100644
index 1d81f74..c97b4b1 100644
--- a/deployments/gpu-operator/templates/operator.yaml
+++ b/deployments/gpu-operator/templates/operator.yaml
@@ -50,6 +50,45 @@ spec:
@@ -49,6 +49,44 @@ spec:
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
@ -60,11 +60,10 @@ index 50983b20..1dfd9dbc 100644
+ subPath: {{ printf "state_monitor_%s" (base $path) }}
+ {{- end }}
+ {{- end }}
+
readinessProbe:
exec:
command: ["stat", "/tmp/operator-sdk-ready"]
@@ -63,6 +102,12 @@ spec:
livenessProbe:
httpGet:
path: /healthz
@@ -72,6 +110,12 @@ spec:
- name: host-os-release
hostPath:
path: "/etc/os-release"
@ -79,7 +78,7 @@ index 50983b20..1dfd9dbc 100644
{{- toYaml . | nindent 8 }}
diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
new file mode 100644
index 00000000..61f366e8
index 0000000..61f366e
--- /dev/null
+++ b/deployments/gpu-operator/templates/operator_configmap.yaml
@@ -0,0 +1,36 @@
@ -120,18 +119,18 @@ index 00000000..61f366e8
+{{- end }}
+{{- end }}
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 00d94195..8b43c59f 100644
index 78a4757..6689636 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -39,6 +39,8 @@ operator:
@@ -70,6 +70,8 @@ operator:
values: [""]
logging:
timeEncoding: epoch
+ # Set to "include_assets" to include assets/gpu-operator with the helm chart
+ # Set "include_assets" true to include assets/gpu-operator with the helm chart
+ include_assets: ""
driver:
repository: nvcr.io/nvidia
resources:
limits:
cpu: 500m
--
2.17.1