ceph: fixes to deployment and upgrade

1) ceph-nfs (ganesha-ceph) - use NFSv4 only
This is recommended upstream.
v3 and UDP require the portmapper (aka rpcbind), which we
do not want, except where the Ubuntu ganesha version (2.6)
forces it by requiring UDP to be enabled, see [1].
The issue has been fixed in 2.8, included in CentOS.
Additionally disable v3 helper protocols and kerberos
to avoid meaningless warnings.

2) ceph-nfs (ganesha-ceph) - do not export host dbus
It is not in use. This avoids the temptation to try
handling it on the host.

3) Properly handle ceph services deploy and upgrade
Upgrade runs deploy.
The order has been corrected - nfs goes after mds.
Additionally upgrade takes care of rgw for keystone
(for swift emulation).

4) Enhance ceph keyring module with error detection
Now it does not blindly try to create a keyring after
any failure. This used to hide the real issue.

5) Retry ceph admin keyring update until cluster works
Reordering deployment caused issue with ceph cluster not being
fully operational before taking actions on it.

6) CI: Remove osd df from collected logs as it may hang CI
Hangs are caused by healthy MON and no healthy MGR.
A descriptive note is left in its place.

7) CI: Add 5s timeout to ceph informational commands
This decreases the timeout from the default 300s.

[1] https://review.opendev.org/669315

Change-Id: I1cf0ad10b80552f503898e723f0c4bd00a38f143
Signed-off-by: Radosław Piliszek <radoslaw.piliszek@gmail.com>
This commit is contained in:
Radosław Piliszek 2019-07-28 11:41:46 +02:00
parent 68542d0b4a
commit 826f6850d0
6 changed files with 48 additions and 54 deletions

View File

@ -15,6 +15,7 @@
# limitations under the License. # limitations under the License.
import json import json
import re
import subprocess # nosec import subprocess # nosec
@ -51,13 +52,16 @@ EXAMPLES = '''
name: client.admin name: client.admin
container_name: ceph_mon container_name: ceph_mon
caps: caps:
mds: 'allow' mds: 'allow *'
mon: 'allow *' mon: 'allow *'
osd: 'allow *' osd: 'allow *'
mgr: 'allow *' mgr: 'allow *'
''' '''
enoent_re = re.compile(r"\bENOENT\b")
class CephKeyring(object): class CephKeyring(object):
def __init__(self, name, caps, container_name='ceph_mon'): def __init__(self, name, caps, container_name='ceph_mon'):
self.name = name self.name = name
@ -93,7 +97,10 @@ class CephKeyring(object):
def ensure_keyring(self): def ensure_keyring(self):
try: try:
stdout = self.get_keyring() stdout = self.get_keyring()
except subprocess.CalledProcessError: except subprocess.CalledProcessError as e:
if e.returncode != 2 or not enoent_re.search(e.output):
# this is not a missing keyring case
raise
# keyring doesn't exsit, try to create it # keyring doesn't exsit, try to create it
stdout = self.create_keyring() stdout = self.create_keyring()
self.changed = True self.changed = True

View File

@ -1,4 +1,6 @@
--- ---
# NOTE(yoctozepto): this file is used during upgrade as well
- include_tasks: config.yml - include_tasks: config.yml
- include_tasks: bootstrap_mons.yml - include_tasks: bootstrap_mons.yml
@ -9,19 +11,8 @@
- include_tasks: start_mons.yml - include_tasks: start_mons.yml
when: inventory_hostname in groups['ceph-mon'] when: inventory_hostname in groups['ceph-mon']
- include_tasks: start_mgrs.yml # NOTE(yoctozepto): this ensures caps for admin are always up-to-date (run as earliest as possible = after MONs start)
when: inventory_hostname in groups['ceph-mgr'] # this is retried because the cluster might not be fully operational yet (quorum gathering)
- include_tasks: start_ceph_dashboard.yml
when:
- enable_ceph_dashboard | bool
- inventory_hostname in groups['ceph-mon']
- include_tasks: start_nfss.yml
when:
- enable_ceph_nfs | bool
- inventory_hostname in groups['ceph-nfs']
- name: configuring client.admin caps - name: configuring client.admin caps
become: true become: true
kolla_ceph_keyring: kolla_ceph_keyring:
@ -29,6 +20,18 @@
caps: "{{ ceph_client_admin_keyring_caps }}" caps: "{{ ceph_client_admin_keyring_caps }}"
run_once: True run_once: True
delegate_to: "{{ groups['ceph-mon'][0] }}" delegate_to: "{{ groups['ceph-mon'][0] }}"
register: result
until: result is success
retries: 3
delay: 15
- include_tasks: start_mgrs.yml
when: inventory_hostname in groups['ceph-mgr']
- include_tasks: start_ceph_dashboard.yml
when:
- enable_ceph_dashboard | bool
- inventory_hostname in groups['ceph-mon']
- include_tasks: bootstrap_osds.yml - include_tasks: bootstrap_osds.yml
when: inventory_hostname in groups['ceph-osd'] when: inventory_hostname in groups['ceph-osd']
@ -50,3 +53,9 @@
when: when:
- enable_ceph_mds | bool - enable_ceph_mds | bool
- inventory_hostname in groups['ceph-mds'] - inventory_hostname in groups['ceph-mds']
# NOTE(yoctozepto): nfs (cephfs-based) depends on mds so start it after
- include_tasks: start_nfss.yml
when:
- enable_ceph_nfs | bool
- inventory_hostname in groups['ceph-nfs']

View File

@ -9,5 +9,4 @@
volumes: volumes:
- "{{ node_config_directory }}/ceph-nfs/:{{ container_config_directory }}/:ro" - "{{ node_config_directory }}/ceph-nfs/:{{ container_config_directory }}/:ro"
- "/etc/localtime:/etc/localtime:ro" - "/etc/localtime:/etc/localtime:ro"
- "/var/run/dbus:/var/run/dbus"
- "kolla_logs:/var/log/kolla/" - "kolla_logs:/var/log/kolla/"

View File

@ -1,39 +1,5 @@
--- ---
- include_tasks: config.yml - include_tasks: deploy.yml
# NOTE(jeffrey4l): client.admin caps should be update when upgrade from Jewel
# to Luminous
- name: configuring client.admin caps
become: true
kolla_ceph_keyring:
name: client.admin
caps: "{{ ceph_client_admin_keyring_caps }}"
run_once: True
delegate_to: "{{ groups['ceph-mon'][0] }}"
- include_tasks: start_mons.yml
when: inventory_hostname in groups['ceph-mon']
- include_tasks: start_mgrs.yml
when: inventory_hostname in groups['ceph-mgr']
- include_tasks: start_osds.yml
when: inventory_hostname in groups['ceph-osd']
- include_tasks: start_rgws.yml
when:
- enable_ceph_rgw | bool
- inventory_hostname in groups['ceph-rgw']
- include_tasks: start_mdss.yml
when:
- enable_ceph_mds | bool
- inventory_hostname in groups['ceph-mds']
- include_tasks: start_nfss.yml
when:
- enable_ceph_nfs | bool
- inventory_hostname in groups['ceph-nfs']
- name: Check final release (as running on MONs) - name: Check final release (as running on MONs)
become: true become: true

View File

@ -1,3 +1,14 @@
NFS_CORE_PARAM {
Protocols = 4;
Enable_NLM = false;
Enable_RQUOTA = false;
Enable_UDP = false;
}
NFS_KRB5 {
Active_krb5 = false;
}
EXPORT EXPORT
{ {
Export_id=1; Export_id=1;

View File

@ -43,9 +43,11 @@ copy_logs() {
# ceph related logs # ceph related logs
if [[ $(docker ps --filter name=ceph_mon --format "{{.Names}}") ]]; then if [[ $(docker ps --filter name=ceph_mon --format "{{.Names}}") ]]; then
docker exec ceph_mon ceph -s > ${LOG_DIR}/kolla/ceph/ceph_s.txt docker exec ceph_mon ceph --connect-timeout 5 -s > ${LOG_DIR}/kolla/ceph/ceph_s.txt
docker exec ceph_mon ceph osd df > ${LOG_DIR}/kolla/ceph/ceph_osd_df.txt # NOTE(yoctozepto): osd df removed on purpose to avoid CI POST_FAILURE due to a possible hang:
docker exec ceph_mon ceph osd tree > ${LOG_DIR}/kolla/ceph/ceph_osd_tree.txt # as of ceph mimic it hangs when MON is operational but MGR not
# its usefulness is mediocre and having POST_FAILUREs is bad
docker exec ceph_mon ceph --connect-timeout 5 osd tree > ${LOG_DIR}/kolla/ceph/ceph_osd_tree.txt
fi fi
# bifrost related logs # bifrost related logs