Chinasubbareddy Mallavarapu 321b8cb7e3 [ceph-osd] Logic improvement for used osd disk detection
This is to improve the logic to detect used osd disks so that scripts will
not zap the osd disks agressively.

also adding debugging mode for pvdisplay commands to capture more logs
during failure scenarios along with reading osd force repair flag from
values.

Change-Id: Id2996211dd92ac963ad531f8671a7cc8f7b7d2d5
2020-10-15 13:13:28 +00:00

383 lines
11 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for ceph-osd.
# This is a YAML-formatted file.
# Declare name/value pairs to be passed into your templates.
# name: value
---
images:
pull_policy: IfNotPresent
tags:
ceph_osd: 'docker.io/openstackhelm/ceph-daemon:ubuntu_bionic-20200521'
ceph_bootstrap: 'docker.io/openstackhelm/ceph-daemon:ubuntu_bionic-20200521'
ceph_config_helper: 'docker.io/openstackhelm/ceph-config-helper:ubuntu_bionic-20200521'
dep_check: 'quay.io/airshipit/kubernetes-entrypoint:v1.0.0'
image_repo_sync: 'docker.io/docker:17.07.0'
local_registry:
active: false
exclude:
- dep_check
- image_repo_sync
labels:
job:
node_selector_key: openstack-control-plane
node_selector_value: enabled
test:
node_selector_key: openstack-control-plane
node_selector_value: enabled
osd:
node_selector_key: ceph-osd
node_selector_value: enabled
# We could deploy ceph cluster now with either ceph-volume or ceph-disk however
# ceph-disk is deprecated from Nautilus.
# Keeping ceph-disk as default since gate scripts are still directory backed
# osds, need to change this after moving the gates to disk backed osd.
deploy:
tool: "ceph-volume"
# NOTE: set this to 1 if osd disk needs wiping in case of reusing from previous deployment
osd_force_repair: 1
pod:
security_context:
osd:
pod:
runAsUser: 65534
container:
ceph_init_dirs:
runAsUser: 0
readOnlyRootFilesystem: true
ceph_log_ownership:
runAsUser: 0
readOnlyRootFilesystem: true
osd_init:
runAsUser: 0
privileged: true
readOnlyRootFilesystem: true
osd_pod:
runAsUser: 0
privileged: true
readOnlyRootFilesystem: true
log_runner:
runAsUser: 0
readOnlyRootFilesystem: true
bootstrap:
pod:
runAsUser: 65534
container:
ceph_osd_bootstrap:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
post_apply:
pod:
runAsUser: 65534
container:
ceph_osd_post_apply:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
test:
pod:
runAsUser: 65534
container:
ceph_cluster_helm_test:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
dns_policy: "ClusterFirstWithHostNet"
lifecycle:
upgrades:
daemonsets:
pod_replacement_strategy: RollingUpdate
osd:
enabled: true
min_ready_seconds: 0
max_unavailable: 1
affinity:
anti:
type:
default: preferredDuringSchedulingIgnoredDuringExecution
topologyKey:
default: kubernetes.io/hostname
weight:
default: 10
resources:
enabled: false
osd:
requests:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "5Gi"
cpu: "2000m"
tests:
requests:
memory: "10Mi"
cpu: "250m"
limits:
memory: "50Mi"
cpu: "500m"
jobs:
image_repo_sync:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
secrets:
keyrings:
osd: ceph-bootstrap-osd-keyring
admin: ceph-client-admin-keyring
network:
public: 192.168.0.0/16
cluster: 192.168.0.0/16
jobs:
ceph_defragosds:
# Execute the 1st of each month
cron: "0 0 1 * *"
history:
# Number of successful job to keep
successJob: 1
# Number of failed job to keep
failJob: 1
concurrency:
# Skip new job if previous job still active
execPolicy: Forbid
startingDeadlineSecs: 60
conf:
ceph:
global:
# auth
cephx: true
cephx_require_signatures: false
cephx_cluster_require_signatures: true
cephx_service_require_signatures: false
objecter_inflight_op_bytes: "1073741824"
objecter_inflight_ops: 10240
debug_ms: "0/0"
mon_osd_down_out_interval: 1800
mon_osd_down_out_subtree_limit: root
mon_osd_min_in_ratio: 0
mon_osd_min_up_ratio: 0
osd:
osd_mkfs_type: xfs
osd_mkfs_options_xfs: -f -i size=2048
osd_max_object_name_len: 256
ms_bind_port_min: 6800
ms_bind_port_max: 7100
osd_snap_trim_priority: 1
osd_snap_trim_sleep: 0.1
osd_pg_max_concurrent_snap_trims: 1
filestore_merge_threshold: -10
filestore_split_multiple: 12
filestore_max_sync_interval: 10
osd_scrub_begin_hour: 22
osd_scrub_end_hour: 4
osd_scrub_during_recovery: false
osd_scrub_sleep: 0.1
osd_scrub_chunk_min: 1
osd_scrub_chunk_max: 4
osd_scrub_load_threshold: 10.0
osd_deep_scrub_stride: "1048576"
osd_scrub_priority: 1
osd_recovery_op_priority: 1
osd_recovery_max_active: 1
osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
osd_journal_size: 10240
osd_crush_update_on_start: false
target:
# This is just for helm tests to proceed the deployment if we have mentioned % of
# osds are up and running.
required_percent_of_osds: 75
storage:
# NOTE(supamatt): By default use host based buckets for failure domains. Any `failure_domain` defined must
# match the failure domain used on your CRUSH rules for pools. For example with a crush rule of
# rack_replicated_rule you would specify "rack" as the `failure_domain` to use.
# `failure_domain`: Set the CRUSH bucket type for your OSD to reside in. See the supported CRUSH configuration
# as listed here: Supported CRUSH configuration is listed here: http://docs.ceph.com/docs/nautilus/rados/operations/crush-map/
# if failure domain is rack then it will check for node label "rack" and get the value from it to create the rack, if there
# is no label rack then it will use following options.
# `failure_domain_by_hostname`: Specify the portion of the hostname to use for your failure domain bucket name.
# `failure_domain_by_hostname_map`: Explicit mapping of hostname to failure domain, as a simpler alternative to overrides.
# `failure_domain_name`: Manually name the failure domain bucket name. This configuration option should only be used
# when using host based overrides.
failure_domain: "host"
failure_domain_by_hostname: "false"
failure_domain_by_hostname_map: {}
# Example:
# failure_domain_map_hostname_map:
# hostfoo: rack1
# hostbar: rack1
# hostbaz: rack2
# hostqux: rack2
failure_domain_name: "false"
# Note: You can override the device class by adding the value (e.g., hdd, ssd or nvme).
# Leave it empty if you don't need to modify the device class.
device_class: ""
# NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
# define OSD pods that will be deployed across the cluster.
# when specifing whole disk (/dev/sdf) for journals, ceph-osd chart will create
# needed partitions for each OSDs.
osd:
- data:
type: directory
location: /var/lib/openstack-helm/ceph/osd/osd-one
journal:
type: directory
location: /var/lib/openstack-helm/ceph/osd/journal-one
# - data:
# type: bluestore
# location: /dev/sdb
# Separate block devices may be used for block.db and/or block.wal
# Without these values they will be co-located on the data volume
# Specify the location and size in Gb. It is recommended that the
# block_db size isnt smaller than 4% of block. For example, if the
# block size is 1TB, then block_db shouldnt be less than 40GB.
# A size suffix of K for kilobytes, M for megabytes, G for gigabytes,
# T for terabytes, P for petabytes or E for exabytes is optional.
# Default unit is megabytes.
# block_db:
# location: /dev/sdc
# size: "96GB"
# block_wal:
# location: /dev/sdc
# size: "2GB"
# - data:
# type: block-logical
# location: /dev/sdd
# journal:
# type: block-logical
# location: /dev/sdf1
# - data:
# type: block-logical
# location: /dev/sde
# journal:
# type: block-logical
# location: /dev/sdf2
# - data:
# type: block-logical
# location: /dev/sdg
# journal:
# type: directory
# location: /var/lib/openstack-helm/ceph/osd/journal-sdg
# NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define
# OSD pods that will be deployed upon specifc nodes.
# overrides:
# ceph_osd:
# hosts:
# - name: host1.fqdn
# conf:
# storage:
# failure_domain_name: "rack1"
# osd:
# - data:
# type: directory
# location: /var/lib/openstack-helm/ceph/osd/data-three
# journal:
# type: directory
# location: /var/lib/openstack-helm/ceph/osd/journal-three
daemonset:
prefix_name: "osd"
dependencies:
dynamic:
common:
local_image_registry:
jobs:
- ceph-osd-image-repo-sync
services:
- endpoint: node
service: local_image_registry
static:
osd:
jobs:
- ceph-storage-keys-generator
- ceph-osd-keyring-generator
services:
- endpoint: internal
service: ceph_mon
image_repo_sync:
services:
- endpoint: internal
service: local_image_registry
tests:
jobs:
- ceph-storage-keys-generator
- ceph-osd-keyring-generator
services:
- endpoint: internal
service: ceph_mon
logging:
truncate:
size: 0
period: 3600
osd_id:
timeout: 300
bootstrap:
enabled: true
script: |
ceph -s
endpoints:
cluster_domain_suffix: cluster.local
local_image_registry:
name: docker-registry
namespace: docker-registry
hosts:
default: localhost
internal: docker-registry
node: localhost
host_fqdn_override:
default: null
port:
registry:
node: 5000
ceph_mon:
namespace: null
hosts:
default: ceph-mon
discovery: ceph-mon-discovery
host_fqdn_override:
default: null
port:
mon:
default: 6789
mon_msgr2:
default: 3300
manifests:
configmap_bin: true
configmap_etc: true
configmap_test_bin: true
daemonset_osd: true
job_bootstrap: false
job_post_apply: true
job_image_repo_sync: true
helm_tests: true
...