Debian: Add package gpu-operator

This change adds the gpu-operator package to the Debian build. The
NVIDIA GPU Operator uses the operator framework within Kubernetes to
automate the management of all NVIDIA software components needed to
provision GPUs.

The provided patches come from the CentOS port done in
https://review.opendev.org/c/starlingx/integ/+/784144
https://review.opendev.org/c/starlingx/integ/+/817725

Test plan (Debian only)
PASS  build ISO with the package installed
PASS  execute helm install
PASS  execute helm uninstall
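
A minimal sketch of the helm steps exercised above, assuming the chart
is taken from the /opt/extracharts path populated by this package and
that the release name and namespace are free choices:

$ helm install gpu-operator /opt/extracharts/gpu-operator-v3-1.8.1.tgz \
    --namespace kube-system
$ helm uninstall gpu-operator --namespace kube-system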

Story: 2009968
Task: 45976

Signed-off-by: Andre Fernando Zanella Kantek <AndreFernandoZanella.Kantek@windriver.com>
Change-Id: Ic656d764dc3e31dcd89e02b172c14eb6d32743a7
Author: Andre Fernando Zanella Kantek
Date: 2022-07-27 10:56:43 -03:00
parent 1c467064a8
commit fd5d9e694b
10 changed files with 1089 additions and 0 deletions

View File

@@ -45,6 +45,7 @@ golang-github-dev/golang-github-cilium-ebpf-dev
golang-github-dev/golang-github-coreos-go-systemd-dev
golang-github-dev/golang-github-opencontainers-specs-dev
golang-github-dev/golang-github-vishvananda-netlink
gpu/gpu-operator
grub/grub2
grub/grubby
kubernetes/armada

View File

@@ -0,0 +1,5 @@
gpu-operator (1.8.1) unstable; urgency=medium

  * Initial release.

 -- Andre Kantek <andrefernandozanella.kantek@windriver.com>  Thu, 27 Jul 2022 14:00:42 +0000

View File

@@ -0,0 +1,14 @@
Source: gpu-operator
Section: admin
Priority: optional
Maintainer: StarlingX Developers <starlingx-discuss@lists.starlingx.io>
Build-Depends: debhelper-compat (= 13), helm
Standards-Version: 4.5.1
Homepage: https://www.starlingx.io

Package: gpu-operator
Architecture: any
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: The NVIDIA GPU Operator uses the operator framework within
 Kubernetes to automate the management of all NVIDIA software components
 needed to provision GPUs

View File

@@ -0,0 +1,29 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Contact: https://github.com/NVIDIA/gpu-operator/
Source: https://github.com/NVIDIA/gpu-operator/

Files: *
Copyright: (C) 2018-2022 https://github.com/NVIDIA/gpu-operator/
License: Apache-2

Upstream-Name: gpu-operator
Upstream-Contact: StarlingX Developers <starlingx-discuss@lists.starlingx.io>
Source: https://opendev.org/starlingx/integ/src/branch/master/gpu/gpu-operator/

Files: debian/*
Copyright: (c) 2022 Wind River Systems, Inc.
License: Apache-2
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 .
 http://www.apache.org/licenses/LICENSE-2.0
 .
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 .
 On Debian-based systems the full text of the Apache version 2.0 license
 can be found in `/usr/share/common-licenses/Apache-2.0'.

View File

@@ -0,0 +1 @@
opt/extracharts/gpu-operator-v3-1.8.1.tgz

View File

@@ -0,0 +1,23 @@
#!/usr/bin/make -f

export HELM_VER = v3
export PKG_VERSION = 1.8.1
export DEBIAN_DESTDIR := $(CURDIR)/debian/tmp

%:
	dh $@

override_dh_auto_build:
	mkdir -p deployments/gpu-operator/assets/state-driver/
	mkdir -p deployments/gpu-operator/assets/state-operator-validation/
	cp assets/state-driver/0500_daemonset.yaml deployments/gpu-operator/assets/state-driver/0500_daemonset.yaml
	cp assets/state-operator-validation/0500_daemonset.yaml deployments/gpu-operator/assets/state-operator-validation/0500_daemonset.yaml
	helm lint deployments/gpu-operator
	mkdir build_results
	helm package --version ${HELM_VER}-${PKG_VERSION} --app-version v${PKG_VERSION} -d build_results deployments/gpu-operator

override_dh_auto_install:
	# Install the app tar file.
	install -d -m 755 ${DEBIAN_DESTDIR}/opt/extracharts
	install -p -D -m 644 build_results/gpu-operator-${HELM_VER}-${PKG_VERSION}.tgz ${DEBIAN_DESTDIR}/opt/extracharts
	dh_install
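
The chart produced by the rules above can be sanity-checked on the build
host; a sketch, assuming Helm 3 is available and the build has just run:

$ helm show chart build_results/gpu-operator-v3-1.8.1.tgz
$ tar -tzf build_results/gpu-operator-v3-1.8.1.tgz | grep assets/ | head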

View File

@@ -0,0 +1,11 @@
---
debname: gpu-operator
debver: 1.8.1
dl_path:
  name: gpu-operator-v1.8.1.tar.gz
  url: https://github.com/NVIDIA/gpu-operator/archive/refs/tags/v1.8.1.tar.gz
  md5sum: 03c7346c724774ecd63d33ba7d8e110a
  sha256sum: 42e08c95ce5b558a296cb31c98a6beeef3b551d47d236fa082db7fa5c44ad471
revision:
  dist: $STX_DIST
  PKG_GITREVCOUNT: true
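
The checksums recorded above can be used to verify a locally fetched
tarball; a sketch, with the file name taken from dl_path:

$ curl -L -o gpu-operator-v1.8.1.tar.gz \
    https://github.com/NVIDIA/gpu-operator/archive/refs/tags/v1.8.1.tar.gz
$ echo "42e08c95ce5b558a296cb31c98a6beeef3b551d47d236fa082db7fa5c44ad471  gpu-operator-v1.8.1.tar.gz" | sha256sum -c -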

View File

@@ -0,0 +1,136 @@
From 1094b6f1593ec454b3a6313ecf9fae53f8c66899 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <babak.sarashki@windriver.com>
Date: Sat, 6 Mar 2021 00:22:40 +0000
Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts

This feature allows inclusion of assets/ in the helm chart and their
export to the gpu-operator pod through configmap volumeMounts.

Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
---
.../gpu-operator/templates/operator.yaml | 44 +++++++++++++++++++
.../templates/operator_configmap.yaml | 36 +++++++++++++++
deployments/gpu-operator/values.yaml | 2 +
3 files changed, 82 insertions(+)
create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml
diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
index 1d81f74..c97b4b1 100644
--- a/deployments/gpu-operator/templates/operator.yaml
+++ b/deployments/gpu-operator/templates/operator.yaml
@@ -49,6 +49,44 @@ spec:
- name: host-os-release
mountPath: "/host-etc/os-release"
readOnly: true
+
+ {{- if eq .Values.operator.include_assets "include_assets" }}
+ {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
+ subPath: {{ printf "gfd_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
+ subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
+ subPath: {{ printf "state_device_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
+ subPath: {{ printf "state_device_validation_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }}
+ subPath: {{ printf "state_driver_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
+ subPath: {{ printf "state_monitor_%s" (base $path) }}
+ {{- end }}
+ {{- end }}
livenessProbe:
httpGet:
path: /healthz
@@ -72,6 +110,12 @@ spec:
- name: host-os-release
hostPath:
path: "/etc/os-release"
+ {{- if eq .Values.operator.include_assets "include_assets" }}
+ - name: assets
+ configMap:
+ name: operator-configmap
+ {{- end }}
+
{{- with .Values.operator.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
new file mode 100644
index 0000000..61f366e
--- /dev/null
+++ b/deployments/gpu-operator/templates/operator_configmap.yaml
@@ -0,0 +1,36 @@
+{{- if eq .Values.operator.include_assets "include_assets" }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: operator-configmap
+data:
+{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
+{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
+{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+{{ printf "state_device_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
+{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
+{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
+{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+{{- end }}
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 78a4757..6689636 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -70,6 +70,8 @@ operator:
values: [""]
logging:
timeEncoding: epoch
+ # Set "include_assets" true to include assets/gpu-operator with the helm chart
+ include_assets: ""
resources:
limits:
cpu: 500m
--
2.17.1
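
The include_assets switch introduced by the patch above can be exercised
by rendering the chart locally; a sketch, assuming Helm 3 and that the
chart dependencies have already been fetched:

$ helm template gpu-operator deployments/gpu-operator \
    --set operator.include_assets="include_assets" \
    | grep -B 1 -A 3 "name: operator-configmap"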

View File

@@ -0,0 +1,867 @@
From 65ac63ca1bc8517f3f0c3560498de758149a3800 Mon Sep 17 00:00:00 2001
From: Babak Sarashki <babak.sarashki@windriver.com>
Date: Sun, 7 Mar 2021 17:19:08 +0000
Subject: [PATCH] enablement: support on starlingx cloud platform

StarlingX is a cloud infrastructure software stack for the edge. It has
an immutable file system and system configuration; for instance, changes
made by the gpu-operator to set the containerd runtime would be
overridden and must be avoided.

This commit enables the gpu-operator on StarlingX (starlingx.io). The
changes to the gpu-operator include bundling modified assets and a
modified version of the nvidia-driver build script with the helm charts.

The modifications include host-mounting the kernel headers and kernel
build directory onto the respective mount points inside the driver pod
namespace; modifying the nvidia-driver script to account for
pre-installed kernel packages; and pre-installing the nvidia-toolkit
version 1.7.1-ubi8. The defaultRuntime is expected to be containerd.

To load the operator on starlingx:

$ source /etc/platform/openrc
[...(keystone_admin)]$ system service-parameter-add \
    platform container_runtime \
    custom_container_runtime=nvidia:/path/to/nvidia-container-runtime
[...(keystone_admin)]$ system host-lock 1; system host-unlock 1

Signed-off-by: Babak Sarashki <babak.sarashki@windriver.com>
---
assets/state-driver/0500_daemonset.yaml | 47 ++-
.../0500_daemonset.yaml | 18 ++
deployments/gpu-operator/Chart.yaml | 3 +
.../charts/stx-toolkit-installer/.helmignore | 23 ++
.../charts/stx-toolkit-installer/Chart.yaml | 6 +
.../templates/_helpers.tpl | 6 +
.../templates/toolkit.yaml | 71 +++++
.../charts/stx-toolkit-installer/values.yaml | 8 +
.../templates/build_configmap.yaml | 291 ++++++++++++++++++
.../gpu-operator/templates/clusterpolicy.yaml | 4 +-
.../gpu-operator/templates/operator.yaml | 52 +++-
.../templates/operator_confimap.yaml | 61 ++++
deployments/gpu-operator/values.yaml | 15 +-
13 files changed, 583 insertions(+), 22 deletions(-)
create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
create mode 100644 deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
create mode 100644 deployments/gpu-operator/templates/build_configmap.yaml
create mode 100644 deployments/gpu-operator/templates/operator_confimap.yaml
diff --git a/assets/state-driver/0500_daemonset.yaml b/assets/state-driver/0500_daemonset.yaml
index 4cd1617..c8aefd2 100644
--- a/assets/state-driver/0500_daemonset.yaml
+++ b/assets/state-driver/0500_daemonset.yaml
@@ -35,7 +35,6 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- # always use runc for driver containers
- name: NVIDIA_VISIBLE_DEVICES
value: void
securityContext:
@@ -72,8 +71,14 @@ spec:
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-driver-ctr
- command: ["nvidia-driver"]
- args: ["init"]
+ command: ["/bin/bash"]
+ args:
+ - "-c"
+ - "--"
+ - >
+ tar -C /usr/host-include -c . -f - | tar -C /usr/include -xvf -;
+ ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so;
+ /usr/local/bin/nvidia-driver init;
securityContext:
privileged: true
seLinuxOptions:
@@ -94,6 +99,22 @@ spec:
- name: run-mellanox-drivers
mountPath: /run/mellanox/drivers
mountPropagation: HostToContainer
+ - name: host-modules
+ mountPath: /lib/modules
+ readOnly: false
+ - name: host-include
+ mountPath: /usr/host-include
+ readOnly: false
+ - name: host-kernel-devel
+ mountPath: /usr/src/kernels
+ readOnly: true
+ - name: host-usr-src
+ mountPath: /usr/host-src
+ readOnly: false
+ - name: vol11
+ mountPath: /usr/local/bin/nvidia-driver
+ subPath: nvidia-driver-build-script
+ readOnly: true
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-peermem-ctr
@@ -157,4 +178,22 @@ spec:
hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
-
+ - name: host-modules
+ hostPath:
+ path: /lib/modules
+ - name: host-kernel-devel
+ hostPath:
+ path: /usr/src/kernels/
+ - name: host-include
+ hostPath:
+ path: /usr/include
+ - name: host-usr-src
+ hostPath:
+ path: /usr/src
+ - name: vol11
+ configMap:
+ name: nvidia-driver
+ defaultMode: 0777
+ items:
+ - key: nvidia-driver-build-script
+ path: nvidia-driver-build-script
diff --git a/assets/state-operator-validation/0500_daemonset.yaml b/assets/state-operator-validation/0500_daemonset.yaml
index 266c9d6..ce226fa 100644
--- a/assets/state-operator-validation/0500_daemonset.yaml
+++ b/assets/state-operator-validation/0500_daemonset.yaml
@@ -75,6 +75,10 @@ spec:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
+ - name: vol12
+ mountPath: /var/nvidia/manifests/cuda-workload-validation.yaml
+ subPath: cuda-workload-validation.yaml
+ readOnly: true
- name: plugin-validation
image: "FILLED_BY_OPERATOR"
command: ['sh', '-c']
@@ -98,6 +102,10 @@ spec:
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
+ - name: vol12
+ mountPath: /var/nvidia/manifests/plugin-workload-validation.yaml
+ subPath: plugin-workload-validation.yaml
+ readOnly: true
containers:
- image: "FILLED_BY_OPERATOR"
name: nvidia-operator-validator
@@ -113,6 +121,7 @@ spec:
- name: run-nvidia-validations
mountPath: "/run/nvidia/validations"
mountPropagation: Bidirectional
+ terminationGracePeriodSeconds: 60
volumes:
- name: run-nvidia-validations
hostPath:
@@ -121,3 +130,12 @@ spec:
- name: driver-install-path
hostPath:
path: /run/nvidia/driver
+ - name: vol12
+ configMap:
+ name: nvidia-validator
+ defaultMode: 0444
+ items:
+ - key: cuda-workload-validation.yaml
+ path: cuda-workload-validation.yaml
+ - key: plugin-workload-validation.yaml
+ path: plugin-workload-validation.yaml
diff --git a/deployments/gpu-operator/Chart.yaml b/deployments/gpu-operator/Chart.yaml
index 0b379a3..7b743e4 100644
--- a/deployments/gpu-operator/Chart.yaml
+++ b/deployments/gpu-operator/Chart.yaml
@@ -22,3 +22,6 @@ dependencies:
version: 0.8.2
repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
condition: nfd.enabled
+ - name: stx-toolkit-installer
+ version: 0.1.0
+ condition: toolkit-installer.enabled
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
new file mode 100644
index 0000000..c195c58
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+appVersion: v0.1.0
+name: stx-toolkit-installer
+description: "Standalone nvidia toolkit installer for starlingx"
+type: application
+version: 1.7.1-ubi8
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
new file mode 100644
index 0000000..b6f6274
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/_helpers.tpl
@@ -0,0 +1,6 @@
+{{/*
+Full image name with tag
+*/}}
+{{- define "toolkit-installer.fullimage" -}}
+{{- .Values.toolkit.repository -}}/{{- .Values.toolkit.image -}}:{{- .Values.toolkit.version | default .Chart.AppVersion -}}
+{{- end }}
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
new file mode 100644
index 0000000..3cbec11
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/templates/toolkit.yaml
@@ -0,0 +1,71 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: toolkit-installer
+ namespace: kube-system
+ labels:
+ app.kubernetes.io/component: "toolkit-installer"
+ {{ $.Release.labels }}
+spec:
+ selector:
+ matchLabels:
+ {{ $.Release.labels }}
+ app.kubernetes.io/component: "toolkit-installer"
+ app: "toolkit-installer"
+ template:
+ metadata:
+ labels:
+ {{ $.Release.labels }}
+ app.kubernetes.io/component: "toolkit-installer"
+ app: "toolkit-installer"
+ spec:
+ containers:
+ - name: toolkit-daemon
+ image: {{ include "toolkit-installer.fullimage" . }}
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - "/bin/sh"
+ - "-c"
+ - "--"
+ - >
+ if [ $toolkit_force_clean == "true" ] ; then
+ while [[ -f /var/run/nvidia/validations/cuda-ready ]] ||
+ [[ -f /var/run/nvidia/validations/driver-ready ]] ||
+ [[ -f /var/run/nvidia/validations/plugin-ready ]] ||
+ [[ -f /var/run/nvidia/validations/toolkit-ready ]] ;
+ do
+ echo "waiting for gpu pods to exit"
+ sleep 10;
+ done;
+ sleep 60;
+ rm -rf /usr/local/nvidia/toolkit;
+ fi;
+ command: ["/bin/bash"]
+ args:
+ - "-c"
+ - "--"
+ - >
+ ./toolkit install /usr/local/nvidia/toolkit;
+ sleep infinity;
+ env:
+ - name: toolkit_force_clean
+ value: {{ quote .Values.global.toolkit_force_clean }}
+ volumeMounts:
+ - name: toolkitdest
+ mountPath: /usr/local/nvidia
+ readOnly: false
+ - name: varrunnvidia
+ mountPath: /var/run/nvidia
+ readOnly: true
+ {{ if and .Values.global.toolkit_force_clean (eq .Values.global.toolkit_force_clean "true") }}
+ terminationGracePeriodSeconds: 120
+ {{- end }}
+ volumes:
+ - name: toolkitdest
+ hostPath:
+ path: /usr/local/nvidia
+ - name: varrunnvidia
+ hostPath:
+ path: /var/run/nvidia
diff --git a/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
new file mode 100644
index 0000000..b898dc2
--- /dev/null
+++ b/deployments/gpu-operator/charts/stx-toolkit-installer/values.yaml
@@ -0,0 +1,8 @@
+toolkit:
+ repository: nvidia
+ image: container-toolkit
+ version: 1.7.1-ubi8
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets: []
+ priorityClassName: system-node-critical
+ defaultRuntime: containerd
diff --git a/deployments/gpu-operator/templates/build_configmap.yaml b/deployments/gpu-operator/templates/build_configmap.yaml
new file mode 100644
index 0000000..a7453a4
--- /dev/null
+++ b/deployments/gpu-operator/templates/build_configmap.yaml
@@ -0,0 +1,291 @@
+{{ if and .Values.operator.include_assets (eq .Values.operator.include_assets "True") }}
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: "gpu-operator-resources"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: nvidia-driver
+ namespace: gpu-operator-resources
+data:
+ nvidia-driver-build-script: |
+ #! /bin/bash
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
+ # Apache-2.0.
+ # This script is from: https://gitlab.com/nvidia/container-images/driver.
+ # It is modified and included under configmap for platforms that require
+ # pre-installed packages. Such platforms have the option to modify the
+ # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for
+ # further customizations.
+
+ set -eu
+
+ RUN_DIR=/run/nvidia
+ PID_FILE=${RUN_DIR}/${0##*/}.pid
+ DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
+ KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
+ KERNEL_VERSION="$(uname -r)"
+
+ _install_tools() {
+ yum clean all
+ yum install -y centos-release-scl
+ yum install -y epel-release
+ yum install -y devtoolset-8-build devtoolset-8-binutils devtoolset-8-gcc devtoolset-8-make
+ }
+
+ # Load the kernel modules and start persistenced.
+ _load_driver() {
+ echo "Loading IPMI kernel module..."
+ modprobe ipmi_msghandler
+
+ echo "Loading NVIDIA driver kernel modules..."
+ modprobe -a nvidia nvidia-uvm nvidia-modeset
+
+ echo "Starting NVIDIA persistence daemon..."
+ nvidia-persistenced --persistence-mode
+ }
+
+ # Stop persistenced and unload the kernel modules if they are currently loaded.
+ _unload_driver() {
+ local rmmod_args=()
+ local nvidia_deps=0
+ local nvidia_refs=0
+ local nvidia_uvm_refs=0
+ local nvidia_modeset_refs=0
+
+ echo "Stopping NVIDIA persistence daemon..."
+ if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
+ local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
+
+ kill -SIGTERM "${pid}"
+ for i in $(seq 1 10); do
+ kill -0 "${pid}" 2> /dev/null || break
+ sleep 0.1
+ done
+ if [ $i -eq 10 ]; then
+ echo "Could not stop NVIDIA persistence daemon" >&2
+ return 1
+ fi
+ fi
+
+ echo "Unloading NVIDIA driver kernel modules..."
+ if [ -f /sys/module/nvidia_modeset/refcnt ]; then
+ nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
+ rmmod_args+=("nvidia-modeset")
+ ((++nvidia_deps))
+ fi
+ if [ -f /sys/module/nvidia_uvm/refcnt ]; then
+ nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
+ rmmod_args+=("nvidia-uvm")
+ ((++nvidia_deps))
+ fi
+ if [ -f /sys/module/nvidia/refcnt ]; then
+ nvidia_refs=$(< /sys/module/nvidia/refcnt)
+ rmmod_args+=("nvidia")
+ fi
+ if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
+ echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
+ return 1
+ fi
+
+ if [ ${#rmmod_args[@]} -gt 0 ]; then
+ rmmod ${rmmod_args[@]}
+ fi
+ return 0
+ }
+
+ # Link and install the kernel modules from a precompiled package using the nvidia-installer.
+ _install_driver() {
+ local install_args=()
+
+ # Default is standard kernel.
+ if [ ! -z ${IGNORE_PREEMPT_RT_PRESENCE+x} ] ; then
+ echo "WARN: IGNORE_PREEMPT_RT_PRESENCE set"
+ echo "Build Target PREEMPT_RT best effort"
+ fi;
+
+ _install_tools
+ export PATH=/opt/rh/devtoolset-8/root/usr/bin${PATH:+:${PATH}}
+ export PCP_DIR=/opt/rh/devtoolset-8/root
+
+ echo "Installing NVIDIA driver kernel modules..."
+ cd /usr/src/nvidia-${DRIVER_VERSION}
+ # rm -rf /lib/modules/${KERNEL_VERSION}/video
+
+ if [ "${ACCEPT_LICENSE}" = "yes" ]; then
+ install_args+=("--accept-license")
+ fi
+ nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
+ }
+
+ # Mount the driver rootfs into the run directory with the exception of sysfs.
+ _mount_rootfs() {
+ echo "Mounting NVIDIA driver rootfs..."
+ mount --make-runbindable /sys
+ mount --make-private /sys
+ mkdir -p ${RUN_DIR}/driver
+ mount --rbind / ${RUN_DIR}/driver
+ }
+
+ # Unmount the driver rootfs from the run directory.
+ _unmount_rootfs() {
+ echo "Unmounting NVIDIA driver rootfs..."
+ if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
+ umount -l -R ${RUN_DIR}/driver
+ fi
+ }
+
+ init() {
+ echo -e "\n========== NVIDIA Software Installer ==========\n"
+ echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
+
+ exec 3> ${PID_FILE}
+ if ! flock -n 3; then
+ echo "An instance of the NVIDIA driver is already running, aborting"
+ exit 1
+ fi
+ echo $$ >&3
+
+ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
+ trap "_shutdown" EXIT
+
+ _unload_driver || exit 1
+ _unmount_rootfs
+
+ (
+ [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia.ko ] ||
+ [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-uvm.ko ] ||
+ [ ! -f /lib/modules/$(uname -r)/kernel/drivers/video/nvidia-modeset.ko ]
+ ) && _install_driver
+
+ _load_driver
+ _mount_rootfs
+
+ echo "Done, now waiting for signal"
+ sleep infinity &
+ trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
+ trap - EXIT
+ while true; do wait $! || continue; done
+ exit 0
+ }
+
+ usage() {
+ cat >&2 <<EOF
+ Usage: $0 COMMAND [ARG...]
+
+ Commands:
+ init [-a | --accept-license]
+ EOF
+ exit 1
+ }
+
+ if [ $# -eq 0 ]; then
+ usage
+ fi
+ command=$1; shift
+ case "${command}" in
+ init) options=$(getopt -l accept-license -o a -- "$@") ;;
+ *) usage ;;
+ esac
+ if [ $? -ne 0 ]; then
+ usage
+ fi
+ eval set -- "${options}"
+
+ ACCEPT_LICENSE=""
+ KERNEL_VERSION=$(uname -r)
+ PRIVATE_KEY=""
+ PACKAGE_TAG=""
+
+ for opt in ${options}; do
+ case "$opt" in
+ -a | --accept-license) ACCEPT_LICENSE="yes"; shift 1 ;;
+ --) shift; break ;;
+ esac
+ done
+ if [ $# -ne 0 ]; then
+ usage
+ fi
+ $command;
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: nvidia-validator
+ namespace: gpu-operator-resources
+data:
+ cuda-workload-validation.yaml: |
+ apiVersion: v1
+ kind: Pod
+ metadata:
+ labels:
+ app: nvidia-cuda-validator
+ generateName: nvidia-cuda-validator-
+ namespace: gpu-operator-resources
+ spec:
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ readOnlyRootFilesystem: true
+ restartPolicy: OnFailure
+ serviceAccount: nvidia-operator-validator
+ runtimeClassName: nvidia
+ initContainers:
+ - name: cuda-validation
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ command: ['sh', '-c']
+ args: ["vectorAdd"]
+ securityContext:
+ allowPrivilegeEscalation: false
+ containers:
+ - name: nvidia-cuda-validator
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ # override command and args as validation is already done by initContainer
+ command: ['sh', '-c']
+ args: ["echo cuda workload validation is successful"]
+ securityContext:
+ allowPrivilegeEscalation: false
+ plugin-workload-validation.yaml: |
+ apiVersion: v1
+ kind: Pod
+ metadata:
+ labels:
+ app: nvidia-device-plugin-validator
+ generateName: nvidia-device-plugin-validator-
+ namespace: gpu-operator-resources
+ spec:
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ readOnlyRootFilesystem: true
+ restartPolicy: OnFailure
+ serviceAccount: nvidia-operator-validator
+ runtimeClassName: nvidia
+ initContainers:
+ - name: plugin-validation
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ command: ['sh', '-c']
+ args: ["vectorAdd"]
+ securityContext:
+ allowPrivilegeEscalation: false
+ resources:
+ limits:
+ "FILLED_BY_VALIDATOR": 1
+ containers:
+ - name: nvidia-device-plugin-validator
+ image: "FILLED_BY_VALIDATOR"
+ imagePullPolicy: IfNotPresent
+ # override command and args as validation is already done by initContainer
+ command: ['sh', '-c']
+ args: ["echo device-plugin workload validation is successful"]
+ securityContext:
+ allowPrivilegeEscalation: false
+{{- end }}
diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml
index c819a2e..a33cffb 100644
--- a/deployments/gpu-operator/templates/clusterpolicy.yaml
+++ b/deployments/gpu-operator/templates/clusterpolicy.yaml
@@ -152,7 +152,7 @@ spec:
args: {{ toYaml .Values.driver.args | nindent 6 }}
{{- end }}
toolkit:
- enabled: {{ .Values.toolkit.enabled }}
+ enabled: false
{{- if .Values.toolkit.repository }}
repository: {{ .Values.toolkit.repository }}
{{- end }}
@@ -354,4 +354,4 @@ spec:
{{- end }}
{{- if .Values.nodeStatusExporter.args }}
args: {{ toYaml .Values.nodeStatusExporter.args | nindent 6 }}
- {{- end }}
\ No newline at end of file
+ {{- end }}
diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
index c97b4b1..32234d8 100644
--- a/deployments/gpu-operator/templates/operator.yaml
+++ b/deployments/gpu-operator/templates/operator.yaml
@@ -50,29 +50,41 @@ spec:
mountPath: "/host-etc/os-release"
readOnly: true
- {{- if eq .Values.operator.include_assets "include_assets" }}
+ {{ if and .Values.operator.include_assets (eq .Values.operator.include_assets "True") }}
{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
- name: assets
mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
subPath: {{ printf "gfd_%s" (base $path) }}
{{- end }}
+ {{- range $path, $_ := .Files.Glob "assets/pre-requisites//*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/pre-requisites/%s" (base $path) }}
+ subPath: {{ printf "pre_requisites_%s" (base $path) }}
+ {{- end }}
+
{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
- name: assets
mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
{{- end }}
- {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+ {{- range $path, $_ := .Files.Glob "assets/state-dcgm-exporter/*" }}
- name: assets
- mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
- subPath: {{ printf "state_device_%s" (base $path) }}
+ mountPath: {{ printf "/opt/gpu-operator/state-dcgm-exporter/%s" (base $path) }}
+ subPath: {{ printf "state_dcgm_exporter_%s" (base $path) }}
{{- end }}
- {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
+ {{- range $path, $_ := .Files.Glob "assets/state-dcgm/*" }}
- name: assets
- mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
- subPath: {{ printf "state_device_validation_%s" (base $path) }}
+ mountPath: {{ printf "/opt/gpu-operator/state-dcgm/%s" (base $path) }}
+ subPath: {{ printf "state_dcgm_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
+ subPath: {{ printf "state_device_plugin_%s" (base $path) }}
{{- end }}
{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
@@ -81,10 +93,28 @@ spec:
subPath: {{ printf "state_driver_%s" (base $path) }}
{{- end }}
- {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
+ {{- range $path, $_ := .Files.Glob "assets/state-mig-manager/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-mig-manager/%s" (base $path) }}
+ subPath: {{ printf "state_mig_manager_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-node-status-exporter/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-node-status-exporter/%s" (base $path) }}
+ subPath: {{ printf "state_node_status_exporter_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-operator-metrics/*" }}
+ - name: assets
+ mountPath: {{ printf "/opt/gpu-operator/state-operator-metrics/%s" (base $path) }}
+ subPath: {{ printf "state_operator_metrics_%s" (base $path) }}
+ {{- end }}
+
+ {{- range $path, $_ := .Files.Glob "assets/state-operator-validation/*" }}
- name: assets
- mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
- subPath: {{ printf "state_monitor_%s" (base $path) }}
+ mountPath: {{ printf "/opt/gpu-operator/state-operator-validation/%s" (base $path) }}
+ subPath: {{ printf "state_operator_validation_%s" (base $path) }}
{{- end }}
{{- end }}
livenessProbe:
@@ -110,7 +140,7 @@ spec:
- name: host-os-release
hostPath:
path: "/etc/os-release"
- {{- if eq .Values.operator.include_assets "include_assets" }}
+ {{ if and .Values.operator.include_assets (eq .Values.operator.include_assets "True") }}
- name: assets
configMap:
name: operator-configmap
diff --git a/deployments/gpu-operator/templates/operator_confimap.yaml b/deployments/gpu-operator/templates/operator_confimap.yaml
new file mode 100644
index 0000000..6303960
--- /dev/null
+++ b/deployments/gpu-operator/templates/operator_confimap.yaml
@@ -0,0 +1,61 @@
+{{ if and .Values.operator.include_assets (eq .Values.operator.include_assets "True") }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: operator-configmap
+data:
+{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
+{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/pre-requisites//*" }}
+{{ printf "pre_requisites_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
+{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-dcgm-exporter/*" }}
+{{ printf "state_dcgm_exporter_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-dcgm/*" }}
+{{ printf "state_dcgm_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
+{{ printf "state_device_plugin_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
+{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-mig-manager/*" }}
+{{ printf "state_mig_manager_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-node-status-exporter/*" }}
+{{ printf "state_node_status_exporter_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-operator-metrics/*" }}
+{{ printf "state_operator_metrics_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+
+{{- range $path, $_ := .Files.Glob "assets/state-operator-validation/*" }}
+{{ printf "state_operator_validation_%s" (base $path) | indent 2 }}: |-
+{{ $.Files.Get $path | indent 4 }}
+{{- end }}
+{{- end }}
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
index 6689636..e8157a1 100644
--- a/deployments/gpu-operator/values.yaml
+++ b/deployments/gpu-operator/values.yaml
@@ -11,6 +11,9 @@ nfd:
psp:
enabled: false
+toolkit-installer:
+ enabled: true
+
daemonsets:
priorityClassName: system-node-critical
tolerations:
@@ -45,7 +48,7 @@ operator:
imagePullPolicy: IfNotPresent
imagePullSecrets: []
priorityClassName: system-node-critical
- defaultRuntime: docker
+ defaultRuntime: containerd
runtimeClass: nvidia
initContainer:
image: cuda
@@ -70,8 +73,7 @@ operator:
values: [""]
logging:
timeEncoding: epoch
- # Set "include_assets" true to include assets/gpu-operator with the helm chart
- include_assets: ""
+ include_assets: "True"
resources:
limits:
cpu: 500m
@@ -127,10 +129,10 @@ driver:
config: ""
toolkit:
- enabled: true
+ enabled: false
repository: nvcr.io/nvidia/k8s
image: container-toolkit
- version: 1.6.0-ubuntu18.04
+ version: 1.7.1-ubi8
imagePullPolicy: IfNotPresent
imagePullSecrets: []
env: []
@@ -255,3 +257,6 @@ node-feature-discovery:
serviceAccount:
name: node-feature-discovery
+
+global:
+ toolkit_force_clean: false
--
2.17.1
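
Once the chart is installed with the StarlingX defaults set by the patch
above, the objects it adds can be checked directly; a quick sketch,
assuming kubectl access to the cluster:

$ kubectl -n kube-system get daemonset toolkit-installer
$ kubectl -n gpu-operator-resources get configmap nvidia-driver nvidia-validator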

View File

@@ -0,0 +1,2 @@
deployments-setup-configmap-with-assets-for-volumemo.patch
enablement-support-on-starlingx-cloud-platform.patch
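
The series file above lists the patches in the order they are expected to
be applied during the Debian build; the same order can be reproduced by
hand in an unpacked source tree, e.g. with quilt (the QUILT_PATCHES
setting is an assumption about the local setup):

$ QUILT_PATCHES=debian/patches quilt push -a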