# The name of the Kubernetes cluster we are deploying to
# Defaults to the release name if not given, for use as a dependency of openstack-cluster
clusterName: "{{ .Release.Name }}"
# The Kubernetes version of the target cluster
# This is treated as a template at rendering time
kubernetesVersion: v1.22
# Indicates whether the addons are being deployed as part of a Cluster API cluster
# If true, the addons will wait for the cluster to become ready before installing, except
# for the bootstrap addons, which just wait for the API to become available
clusterApi: false
# Details of a secret containing a kubeconfig file for a remote cluster
# If given, this is used in preference to a service account
kubeconfigSecret:
  # The name of the secret
  # This is treated as a template during rendering
  name:
  # The key of the kubeconfig file in the secret
  key: value
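  # Illustrative example only: when these addons are deployed alongside a Cluster API
  # cluster, the kubeconfig secret created by Cluster API is conventionally named
  # "<cluster name>-kubeconfig" with the kubeconfig itself under the "value" key, e.g.:
  #   kubeconfigSecret:
  #     name: "{{ .Values.clusterName }}-kubeconfig"
  #     key: value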
# Options for the service account to use
# A pre-existing service account can be used, or a new one can be created
#
# A service account is always required, as it is used by the pre-delete hook
# to suspend any install jobs that are still running before the deletion proceeds
#
# The permissions required by the service account depend on whether the installation
# is targeting a remote cluster or the local cluster
#
# Whether the installation target is local or remote, the service account needs
# permission to list and patch jobs in the release namespace so that the delete hook
# can suspend any running install jobs
#
# When the installation targets the local cluster, the service account must also have
# permission to create any resources that need to be installed, which may be in
# other namespaces - the cluster-admin cluster role is normally used for this
serviceAccount:
  # Indicates whether to create a new service account
  create: true
  # The name of the cluster role to bind the created service account to
  clusterRoleName: cluster-admin
  # The name of the service account
  # If create = true, this is the name of the created service account
  # If create = false, this is the name of an existing service account to use
  # This is treated as a template during rendering
  name: "{{ include \"cluster-addons.fullname\" . }}-deployer"
# Default settings for jobs
jobDefaults:
  image:
    repository: ghcr.io/stackhpc/k8s-utils
    tag:  # Defaults to chart appVersion if not given
    pullPolicy: IfNotPresent
  imagePullSecrets: []
  backoffLimit: 1000
  activeDeadlineSeconds: 3600
  podSecurityContext:
    runAsNonRoot: true
  securityContext:
    allowPrivilegeEscalation: false
  resources: {}
  hostNetwork: false
  tolerations: []
  nodeSelector: {}
  affinity: {}
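  # Illustrative example only: any of the defaults above can be overridden per deployment,
  # for instance to constrain the resources and placement of the install jobs
  # (the values below are examples, not recommendations):
  #   jobDefaults:
  #     resources:
  #       requests:
  #         cpu: 100m
  #         memory: 128Mi
  #     tolerations:
  #       - key: "node-role.kubernetes.io/master"
  #         operator: "Exists"
  #         effect: "NoSchedule"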
# The available categories for dependencies and the addons that belong to them
categories:
  bootstrap:
    - cloud-config
    - ccm-openstack
    - cni-calico
    - cni-cilium
    - prometheus-operator-crds
  storage: [csi-cinder]
  ingress: [ingress-nginx]
# Settings for the CNI addon
cni:
  # Indicates if a CNI should be deployed
  enabled: true
  # The type of CNI to deploy - supported values are calico or cilium
  type: calico
  # Settings for the Calico CNI
  calico:
    chart:
      repo: https://projectcalico.docs.tigera.io/charts
      name: tigera-operator
      version: v3.23.3
    release:
      namespace: tigera-operator
      # See https://projectcalico.docs.tigera.io/getting-started/kubernetes/helm
      values:
        # Managing the installation separately makes deriving the pod CIDR cleaner
        installation:
          enabled: false
    # The spec of the Calico installation
    # See https://projectcalico.docs.tigera.io/reference/installation/api
    installation:
      calicoNetwork:
        # By default, disable BGP
        bgp: Disabled
        # Use the interface that holds the Kubernetes internal IP
        nodeAddressAutodetectionV4:
          kubernetes: NodeInternalIP
        # Use a single IP pool with VXLAN
        # The special variable __KUBEADM_POD_CIDR__ is replaced with the pod CIDR from the
        # kubeadm configmap, if kubeadm is in use
        ipPools:
          - cidr: __KUBEADM_POD_CIDR__
            encapsulation: VXLAN
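        # Illustrative example only: if kubeadm is not in use, the placeholder above will
        # likely not be resolvable, so the pool CIDR can instead be set explicitly via an
        # overrides file (10.244.0.0/16 below is just an example value):
        #   ipPools:
        #     - cidr: 10.244.0.0/16
        #       encapsulation: VXLAN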
  # Settings for the Cilium CNI
  cilium:
    chart:
      repo: https://helm.cilium.io/
      name: cilium
      version: 1.11.1
    release:
      namespace: kube-system
      # See https://docs.cilium.io/en/stable/gettingstarted/k8s-install-helm/ for details
      values:
        ipam:
          mode: kubernetes
# Settings for the OpenStack integrations
openstack:
  # Indicates if the OpenStack integrations should be enabled
  enabled: false
  # The version of the OpenStack cloud provider to install
  # By default, use the release branch for the Kubernetes version of the target cluster
  version: release-{{ tpl .Values.kubernetesVersion . | trimPrefix "v" }}
  # The base URL for OpenStack cloud provider manifests
  # By default, pull the manifests from GitHub at the specified version
  manifestsBaseURL: https://raw.githubusercontent.com/kubernetes/cloud-provider-openstack/{{ tpl .Values.openstack.version . }}
  # The name of a secret containing a clouds.yaml file and optional cacert
  # If the cacert is present, it should be referred to in the clouds.yaml file as /etc/config/cacert
  # See https://docs.openstack.org/openstacksdk/latest/user/config/configuration.html#ssl-settings
  cloudCredentialsSecretName:
  # The name of the cloud to use in the clouds.yaml
  cloudName: openstack
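  # Illustrative example only (the secret key names are assumptions based on the comment
  # above): a matching credentials secret might look like
  #   apiVersion: v1
  #   kind: Secret
  #   metadata:
  #     name: openstack-cloud-credentials
  #   stringData:
  #     clouds.yaml: |
  #       clouds:
  #         openstack:
  #           auth:
  #             auth_url: https://keystone.example.com:5000
  #             application_credential_id: "<id>"
  #             application_credential_secret: "<secret>"
  #           auth_type: v3applicationcredential
  #           cacert: /etc/config/cacert
  #     cacert: |
  #       <PEM-encoded CA bundle>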
  # cloud-config options for the OpenStack integrations
  # The [Global] section is configured to use the cloud specified by cloudName in the credentials secret
  # See https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/openstack-cloud-controller-manager/using-openstack-cloud-controller-manager.md#config-openstack-cloud-controller-manager
  # and https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/cinder-csi-plugin/using-cinder-csi-plugin.md#block-storage
  cloudConfig:
    # By default, ignore volume AZs for Cinder as most clouds have a single globally-attachable Cinder AZ
    BlockStorage:
      ignore-volume-az: true
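    # Illustrative example only: other sections from the cloud-provider-openstack docs
    # linked above can be added alongside BlockStorage, e.g. LoadBalancer options for
    # the Octavia integration (the values below are placeholders):
    #   LoadBalancer:
    #     floating-network-id: "<external network UUID>"
    #     create-monitor: true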
  # Settings for the Cloud Controller Manager (CCM)
  ccm:
    # Indicates if the OpenStack CCM should be enabled
    # By default, the CCM is enabled if the OpenStack integrations are enabled
    enabled: true
    # The prefix for RBAC manifests
    # Unfortunately, this changes for different Kubernetes versions
    rbacManifestsPrefix: >-
      {{
        tpl .Values.kubernetesVersion . |
          trimPrefix "v" |
          semverCompare ">=1.22" |
          ternary "manifests/controller-manager" "cluster/addons/rbac"
      }}
    # The URLs to use for the manifests
    manifests:
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/{{ tpl .Values.openstack.ccm.rbacManifestsPrefix . }}/cloud-controller-manager-roles.yaml"
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/{{ tpl .Values.openstack.ccm.rbacManifestsPrefix . }}/cloud-controller-manager-role-bindings.yaml"
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/manifests/controller-manager/openstack-cloud-controller-manager-ds.yaml"
    # Any kustomization to apply to the OpenStack CCM manifests
    kustomization: {}
  # Settings for the Cinder CSI plugin
  csiCinder:
    # Indicates if the Cinder CSI should be enabled
    # By default, it is enabled if the OpenStack integrations are enabled
    enabled: true
    # The URLs to use for the manifests
    manifests:
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/manifests/cinder-csi-plugin/cinder-csi-controllerplugin-rbac.yaml"
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/manifests/cinder-csi-plugin/cinder-csi-controllerplugin.yaml"
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/manifests/cinder-csi-plugin/cinder-csi-nodeplugin-rbac.yaml"
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/manifests/cinder-csi-plugin/cinder-csi-nodeplugin.yaml"
      - "{{ tpl .Values.openstack.manifestsBaseURL . }}/manifests/cinder-csi-plugin/csi-cinder-driver.yaml"
    # Any kustomization to apply to the Cinder CSI manifests
    kustomization: {}
    # Variables affecting the definition of the storage class
    storageClass:
      # Indicates if the storage class should be enabled
      enabled: true
      # The name of the storage class
      name: csi-cinder
      # Indicates if the storage class should be annotated as the default storage class
      isDefault: true
      # The reclaim policy for the storage class
      reclaimPolicy: Delete
      # Indicates if volume expansion is allowed
      allowVolumeExpansion: true
      # The Cinder availability zone to use for volumes provisioned by the storage class
      availabilityZone: nova
      # The Cinder volume type to use for volumes provisioned by the storage class
      # If not given, the default volume type will be used
      volumeType:
      # The allowed topologies for the storage class
      allowedTopologies:
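      # Illustrative example only: allowedTopologies uses the standard StorageClass
      # format; the Cinder CSI plugin is generally understood to label nodes with the
      # topology.cinder.csi.openstack.org/zone key, e.g.:
      #   allowedTopologies:
      #     - matchLabelExpressions:
      #         - key: topology.cinder.csi.openstack.org/zone
      #           values:
      #             - nova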
# Settings for the metrics server
metricsServer:
  # Indicates if the metrics server should be deployed
  enabled: true
  # The version of the metrics server to deploy
  version: v0.6.1
  # The URLs of the metrics server manifests
  manifests:
    - https://github.com/kubernetes-sigs/metrics-server/releases/download/{{ .Values.metricsServer.version }}/components.yaml
  # Any kustomization to be applied to the metrics server manifests
  kustomization:
    patches:
      - patch: |-
          - op: add
            path: /spec/template/spec/containers/0/args/-
            value: --kubelet-insecure-tls
        target:
          kind: Deployment
          name: metrics-server
# Settings for the Kubernetes dashboard
kubernetesDashboard:
  # Indicates if the Kubernetes dashboard should be enabled
  enabled: false
  chart:
    repo: https://kubernetes.github.io/dashboard
    name: kubernetes-dashboard
    version: 5.3.1
  release:
    namespace: kubernetes-dashboard
    values:
      # Enable the metrics scraper by default
      metricsScraper:
        enabled: true
# Settings for cert-manager
certManager:
  # Indicates if cert-manager should be enabled
  enabled: false
  chart:
    repo: https://charts.jetstack.io
    name: cert-manager
    version: v1.5.5
  release:
    namespace: cert-manager
    # See https://cert-manager.io/docs/installation/helm/ for available values
    values:
      # By default, make sure the cert-manager CRDs are installed
      installCRDs: true
      # Disable Prometheus support for now
      prometheus:
        enabled: false
  # Settings for automatic ACME HTTP01 support using Let's Encrypt
  # This is only enabled if ingress is also enabled
  acmeHttp01Issuer:
    enabled: true
    name: letsencrypt-http01
    server: https://acme-v02.api.letsencrypt.org/directory
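    # Illustrative example only: assuming the issuer is created as a ClusterIssuer, an
    # Ingress can then request a certificate with the standard cert-manager annotation
    # (use cert-manager.io/issuer instead for a namespaced Issuer):
    #   metadata:
    #     annotations:
    #       cert-manager.io/cluster-issuer: letsencrypt-http01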
# Settings for ingress controllers
ingress:
  # Indicates if ingress controllers should be enabled
  enabled: false
  # Settings for the Nginx ingress controller
  nginx:
    # Indicates if the Nginx ingress controller should be enabled
    enabled: true
    chart:
      repo: https://kubernetes.github.io/ingress-nginx
      name: ingress-nginx
      version: 4.0.18
    release:
      namespace: ingress-nginx
      # See https://github.com/kubernetes/ingress-nginx/tree/main/charts/ingress-nginx#configuration
      values: {}
# Settings for cluster monitoring
monitoring:
  # Indicates if the cluster monitoring should be enabled
  enabled: false
  prometheusOperatorCrds:
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
    - https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.54.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
  kubePrometheusStack:
    chart:
      repo: https://prometheus-community.github.io/helm-charts
      name: kube-prometheus-stack
      version: 34.6.0
    release:
      namespace: monitoring-system
      values: {}
  lokiStack:
    enabled: true
    chart:
      repo: https://grafana.github.io/helm-charts
      name: loki-stack
      version: 2.6.1
    release:
      namespace: monitoring-system
      values: {}
# Settings for node feature discovery
nodeFeatureDiscovery:
  # Indicates if node feature discovery should be enabled
  enabled: true
  chart:
    repo: https://kubernetes-sigs.github.io/node-feature-discovery/charts
    name: node-feature-discovery
    version: 0.11.0
  release:
    namespace: node-feature-discovery
    values:
      master:
        extraLabelNs:
          - nvidia.com
      worker:
        # Allow the NFD pods to be scheduled on master nodes
        tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Equal"
            value: ""
            effect: "NoSchedule"
          - key: "nvidia.com/gpu"
            operator: "Equal"
            value: "present"
            effect: "NoSchedule"
        # We want to be able to identify nodes with high-performance hardware
        # So the whitelisted device classes are:
        #   02   - Network Controllers (e.g. Ethernet, Infiniband)
        #   03   - Display Controllers (e.g. GPUs)
        #   0b40 - Co-processors
        #   12   - Processing Accelerators (e.g. specialised AI inference chips)
        config:
          sources:
            pci:
              deviceClassWhitelist:
                - "02"
                - "03"
                - "0b40"
                - "12"
              deviceLabelFields:
                - vendor
# Settings for the NVIDIA GPU operator
nvidiaGPUOperator:
  # Indicates if the NVIDIA GPU operator should be enabled
  # Note that because it uses node feature discovery to run only on nodes
  # with an NVIDIA GPU available, the overhead of enabling this on clusters
  # that do not need it now but may need it in the future is low
  enabled: true
  chart:
    repo: https://nvidia.github.io/gpu-operator
    name: gpu-operator
    version: v1.10.0
  release:
    namespace: gpu-operator
    values:
      # Use the shared NFD
      nfd:
        enabled: false
      # Export operator and node metrics in a Prometheus format
      # The component provides information on the status of the
      # operator (e.g. reconciliation status, number of GPU-enabled nodes)
      nodeStatusExporter:
        enabled: true
      toolkit:
        # Allowing the toolkit to edit /etc/containerd/config.toml (the default)
        # breaks NVIDIA pod deployment on clusters with a Harbor cache enabled
        # Instead, write a separate config file specifically for the NVIDIA runtime,
        # which is parsed as an "include" in the main containerd config file
        #
        # https://github.com/NVIDIA/gpu-operator/issues/301
        env:
          - name: "CONTAINERD_CONFIG"
            value: "/etc/containerd/conf.d/nvidia.toml"
# Settings for the Mellanox network operator
mellanoxNetworkOperator:
  # Indicates if the network operator should be enabled
  # Note that because it uses node feature discovery to run only on nodes
  # with a Mellanox NIC available, the overhead of enabling this on clusters
  # that do not need it now but may need it in the future is low
  enabled: true
  chart:
    repo: https://mellanox.github.io/network-operator
    name: network-operator
    version: 1.1.0
  release:
    namespace: network-operator
    values:
      # Use the shared NFD
      nfd:
        enabled: false
      # Deploy the default NICClusterPolicy
      deployCR: true
      # Deploy the OFED driver onto nodes with a suitable NIC
      ofedDriver:
        deploy: true
        # OFED takes ages to deploy on low-resource nodes
        # The startup probe has a fixed failure threshold of 60
        # So in order to give the drivers up to one hour to install, we use a period
        # of 60 seconds for the startup probe
        startupProbe:
          initialDelaySeconds: 60
          periodSeconds: 60
      # Deploy the RDMA shared device plugin to allow pods to access the RDMA device
      rdmaSharedDevicePlugin:
        deploy: true
      # Disable all other features for now
      sriovNetworkOperator:
        enabled: false
      sriovDevicePlugin:
        deploy: false
      secondaryNetwork:
        deploy: false
# Map of extra addons in the form "component name" -> "addon spec"
extraAddons: {}
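# Illustrative example only - the exact addon spec schema is defined by this chart's
# addon templates, and the field names below are assumptions for illustration:
#   extraAddons:
#     my-custom-addon:
#       # Addons this addon should wait for (assumed field)
#       dependsOn: [cni]
#       # How the addon is installed, e.g. via custom install/delete scripts (assumed field)
#       installType: custom
#       custom:
#         install: |
#           kubectl apply -f https://example.com/my-addon.yaml
#         delete: |
#           kubectl delete -f https://example.com/my-addon.yaml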