6a96c4fe72
ceph recipe is in meta-virtualization, however, StarlingX has made many modifications, so needs extra effort to port these using bbappend. Signed-off-by: Litao Gao <litao.gao@windriver.com>
335 lines
11 KiB
Python
335 lines
11 KiB
Python
#!/usr/bin/python
|
|
#
|
|
# Copyright (c) 2019 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
|
|
import ast
|
|
import os
|
|
import os.path
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
|
|
# Substring looked for in a disk's device node name; when present,
# is_partitioning_correct() inserts a 'p' between the disk node and the
# partition number when building partition node names.
DEVICE_NAME_NVME = "nvme"
|
|
|
|
#########
|
|
# Utils #
|
|
#########
|
|
|
|
|
|
def command(arguments, **kwargs):
    """Execute a command and capture stdout, stderr & return code.

    Args:
        arguments: Command and its arguments, handed to subprocess.Popen
            unchanged.
        **kwargs: Extra keyword arguments forwarded to subprocess.Popen.

    Returns:
        A tuple (stdout, stderr, returncode). Unless the caller selects an
        output mode explicitly, stdout and stderr are returned as text.
    """
    # Callers in this script match the output against str patterns; on
    # Python 3 the default bytes output would make re.search() raise a
    # TypeError.  Only apply the default when the caller has not chosen a
    # mode, so explicit requests (including bytes) are still honoured.
    text_options = ("universal_newlines", "text", "encoding", "errors")
    if not any(option in kwargs for option in text_options):
        kwargs["universal_newlines"] = True

    process = subprocess.Popen(
        arguments,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        **kwargs)
    out, err = process.communicate()
    return out, err, process.returncode
|
|
|
|
|
|
def get_input(arg, valid_keys):
    """Convert the input to a dict and perform basic validation.

    Args:
        arg: String holding a Python dict literal; literal backslash-n
            sequences are turned into real newlines before parsing.
        valid_keys: Keys that must all be present in the parsed dict.

    Returns:
        The parsed dict, or None when the string cannot be parsed, does not
        evaluate to a dict, or lacks one of the required keys.
    """
    literal_text = arg.replace("\\n", "\n")
    try:
        input_dict = ast.literal_eval(literal_text)
    except Exception:
        # Anything unparsable is simply invalid input for this tool.
        return None

    # A list or string literal can also satisfy an "in" test for the keys,
    # but callers index the result by key, so only accept real dicts.
    if not isinstance(input_dict, dict):
        return None
    if not all(k in input_dict for k in valid_keys):
        return None

    return input_dict
|
|
|
|
|
|
def get_partition_uuid(dev):
    """Return the PARTUUID value found in the blkid output for dev.

    Returns None when the blkid output contains no PARTUUID field.
    """
    blkid_output = command(['blkid', dev])[0]
    found = re.search('PARTUUID=\"(.+?)\"', blkid_output)
    if found is None:
        return None
    return found.group(1)
|
|
|
|
|
|
def device_path_to_device_node(device_path):
    """Resolve a device path to its device node using readlink -f.

    Runs "udevadm settle -E" on the path first, then returns the output of
    "readlink -f" with trailing whitespace removed.  Returns None if running
    either command raises an exception.
    """
    try:
        command(["udevadm", "settle", "-E", device_path])
        resolved = command(["readlink", "-f", device_path])[0]
        return resolved.rstrip()
    except Exception:
        return None
|
|
|
|
|
|
###########################################
|
|
# Manage Journal Disk Partitioning Scheme #
|
|
###########################################
|
|
|
|
# Directory of by-partuuid symlinks.  create_partitions() removes stale links
# from it and fix_location() points the OSD 'journal' symlink at an entry in it.
DISK_BY_PARTUUID = "/dev/disk/by-partuuid/"
|
|
# Value passed to "sgdisk --typecode" for every partition that
# create_partitions() creates.
JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' # Type of a journal partition
|
|
|
|
|
|
def is_partitioning_correct(disk_path, partition_sizes):
    """Validate the existence and size of journal partitions.

    Args:
        disk_path: Device path of the journal disk.
        partition_sizes: Expected partition sizes in MiB, in partition order.

    Returns:
        True when the disk has a GPT partition table and partition N reports
        exactly the N-th expected size; False otherwise.
    """

    # Obtain the device node from the device path.
    disk_node = device_path_to_device_node(disk_path)

    # Check that partition table format is GPT
    command(["udevadm", "settle", "-E", disk_node])
    output, _, _ = command(["parted", "-s", disk_node, "print"])
    if not re.search('Partition Table: gpt', output):
        print("Format of disk node %s is not GPT, zapping disk" % disk_node)
        return False

    # NVMe partition nodes carry a 'p' between disk node and partition number.
    separator = 'p' if DEVICE_NAME_NVME in disk_node else ''

    # Check that each partition size matches the one in input
    for partition_index, size in enumerate(partition_sizes, start=1):
        partition_node = '{}{}{}'.format(disk_node, separator,
                                         partition_index)

        command(["udevadm", "settle", "-E", partition_node])
        cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
        output, _, _ = command(cmd)

        # Accept "SIZEMiB" or "SIZE.00MiB" only.  Allowing arbitrary zeros
        # directly after the number would let an expected size of 10 match
        # a 100 or 1000 MiB partition.  The node is escaped so that it is
        # matched literally.
        regex = ("^Disk " + re.escape(str(partition_node)) + ":\\s*" +
                 re.escape(str(size)) + "(?:\\.0*)?MiB")
        if not re.search(regex, output, re.MULTILINE):
            print("Journal partition %(node)s size is not %(size)s, "
                  "zapping disk" % {"node": partition_node, "size": size})
            return False

    command(["udevadm", "settle", "-t", "10"])
    return True
|
|
|
|
|
|
def _is_partition_of(disk_node, device_node):
    """Return True if device_node names a numbered partition of disk_node."""
    if not device_node.startswith(disk_node):
        return False
    suffix = device_node[len(disk_node):]
    # Disk names ending in a digit separate the partition number with 'p'.
    pattern = r'p\d+$' if disk_node[-1:].isdigit() else r'\d+$'
    return re.match(pattern, suffix) is not None


def create_partitions(disk_path, partition_sizes):
    """Recreate partitions.

    Writes a new GPT table to the disk, removes by-partuuid symlinks that
    pointed at its old partitions, then creates one partition per entry of
    partition_sizes, back to back, starting 1 MiB into the disk.

    Args:
        disk_path: Device path of the journal disk.
        partition_sizes: Partition sizes in MiB, in creation order.

    Exits the process with status 1 if the partition table cannot be erased
    or a partition cannot be created.
    """

    # Obtain the device node from the device path.
    disk_node = device_path_to_device_node(disk_path)

    # Issue: After creating a new partition table on a device, Udev does not
    # always remove old symlinks (i.e. to previous partitions on that device).
    # Also, even if links are erased before zapping the disk, some of them will
    # be recreated even though there is no partition to back them!
    # Therefore, we have to remove the links AFTER we erase the partition table
    # Issue: DISK_BY_PARTUUID directory is not present at all if there are no
    # GPT partitions on the storage node so nothing to remove in this case
    links = []
    if os.path.isdir(DISK_BY_PARTUUID):
        candidates = [os.path.join(DISK_BY_PARTUUID, entry)
                      for entry in os.listdir(DISK_BY_PARTUUID)]
        links = [link for link in candidates if os.path.islink(link)]

    # Erase all partitions on current node by creating a new GPT table
    _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"])
    if ret:
        print("Error erasing partition table of %(node)s\n"
              "Return code: %(ret)s reason: %(reason)s" %
              {"node": disk_node, "ret": ret, "reason": err})
        sys.exit(1)

    # Erase old symlinks.  A plain substring test would also hit partitions
    # of other disks whose name merely contains this one (sda vs. sdaa1).
    for link in links:
        if _is_partition_of(disk_node, os.path.realpath(link)):
            os.remove(link)

    # Create partitions in order
    used_space_mib = 1  # leave 1 MB at the beginning of the disk
    for num, size in enumerate(partition_sizes, start=1):
        start = used_space_mib
        end = used_space_mib + size
        cmd = ['parted', '-s', disk_node, 'unit', 'mib',
               'mkpart', 'primary',
               str(start), str(end)]
        _, err, ret = command(cmd)
        parms = {"disk_node": disk_node,
                 "start": start,
                 "end": end,
                 "reason": err}
        # Check the result before announcing success.
        if ret:
            print("Failed to create partition with "
                  "start=%(start)s, end=%(end)s "
                  "on %(disk_node)s reason: %(reason)s" % parms)
            sys.exit(1)
        print("Created partition from start=%(start)s MiB to end=%(end)s MiB"
              " on %(disk_node)s" % parms)

        # Set partition type to ceph journal
        # noncritical operation, it makes 'ceph-disk list' output correct info
        cmd = ['sgdisk',
               '--change-name={num}:ceph journal'.format(num=num),
               '--typecode={num}:{uuid}'.format(
                   num=num,
                   uuid=JOURNAL_UUID,
               ),
               disk_node]
        _, err, ret = command(cmd)
        if ret:
            print("WARNING: Failed to set partition name and typecode")
        used_space_mib = end
|
|
|
|
|
|
###########################
|
|
# Manage Journal Location #
|
|
###########################
|
|
|
|
# Parent directory of the OSD mount points; mount_data_partition() appends
# "ceph-" plus the OSD id to build the mount path.
OSD_PATH = "/var/lib/ceph/osd/"
|
|
|
|
|
|
def mount_data_partition(data_path, osdid):
    """Mount an OSD data partition and return the mounted path.

    Args:
        data_path: Device path of the OSD data partition.
        osdid: OSD id, used to build the mount path under OSD_PATH.

    Returns:
        The mount path.  Nothing is mounted if the output of "mount" already
        lists the data node at that path.

    Exits the process with status 1 if the mount command fails.
    """

    # Obtain the device node from the device path.
    data_node = device_path_to_device_node(data_path)

    mount_path = OSD_PATH + "ceph-" + str(osdid)
    output, _, _ = command(['mount'])
    # Match node and mount path as whole, literal fields so that e.g.
    # ".../ceph-1" is not considered mounted because ".../ceph-10" is.
    regex = ("^" + re.escape(data_node) + r"\s.*\s" +
             re.escape(mount_path) + r"(?:\s|$)")
    if not re.search(regex, output, re.MULTILINE):
        cmd = ['mount', '-t', 'xfs', data_node, mount_path]
        _, _, ret = command(cmd)
        params = {"node": data_node, "path": mount_path}
        if ret:
            # The conversion type after %(path) is required; without it the
            # formatting itself raises ValueError and hides the real failure.
            print("Failed to mount %(node)s to %(path)s, aborting" % params)
            sys.exit(1)
        else:
            print("Mounted %(node)s to %(path)s" % params)
    return mount_path
|
|
|
|
|
|
def is_location_correct(path, journal_path, osdid):
    """Tell whether the 'journal' entry under path resolves to journal_path.

    The device node behind journal_path is compared with the real path of
    path + "/journal"; osdid is accepted but not used.
    """
    expected_node = device_path_to_device_node(journal_path)
    return os.path.realpath(path + "/journal") == expected_node
|
|
|
|
|
|
def fix_location(mount_point, journal_path, osdid):
    """Move the journal to the new partition.

    Re-points the 'journal' symlink in the OSD mount point to the by-partuuid
    entry of the journal partition, records the UUID in 'journal_uuid',
    zeroes the first 100 MiB of the partition and runs ceph-osd --mkjournal.

    Args:
        mount_point: Path where the OSD data partition is mounted.
        journal_path: Device path of the journal partition.
        osdid: OSD id handed to ceph-osd.

    Exits the process with status 1 if the partition UUID cannot be
    determined or the symlink cannot be created.
    """

    # Obtain the device node from the device path.
    journal_node = device_path_to_device_node(journal_path)

    # Fix symlink
    path = mount_point + "/journal"  # 'journal' symlink path used by ceph-osd
    journal_uuid = get_partition_uuid(journal_node)
    if not journal_uuid:
        # Without a UUID the link target cannot be built; report it instead
        # of failing on the string concatenation below.
        print("Failed to get partition uuid of %s, aborting" % journal_node)
        sys.exit(1)
    new_target = DISK_BY_PARTUUID + journal_uuid
    params = {"path": path, "target": new_target}
    try:
        if os.path.lexists(path):
            os.unlink(path)  # delete the old symlink
        os.symlink(new_target, path)
        print("Symlink created: %(path)s -> %(target)s" % params)
    except OSError:
        print("Failed to create symlink: %(path)s -> %(target)s" % params)
        sys.exit(1)

    # Fix journal_uuid
    path = mount_point + "/journal_uuid"
    try:
        with open(path, 'w') as f:
            f.write(journal_uuid)
    except Exception:
        # The operation is noncritical, it only makes 'ceph-disk list'
        # display complete output. We log and continue.
        params = {"path": path, "uuid": journal_uuid}
        print("WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params)

    # Clean the journal partition
    # even if erasing the partition table, if another journal was present here
    # it's going to be reused. Journals are always bigger than 100MB.
    command(['dd', 'if=/dev/zero', 'of=%s' % journal_node,
             'bs=1M', 'count=100'])

    # Format the journal
    cmd = ['/usr/bin/ceph-osd', '-i', str(osdid),
           '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid,
           '-c', '/etc/ceph/ceph.conf',
           '--cluster', 'ceph',
           '--mkjournal']
    out, err, ret = command(cmd)
    params = {"journal_node": journal_node,
              "osdid": osdid,
              "ret": ret,
              "reason": err}
    if not ret:
        print("Prepared new journal partition: %(journal_node)s "
              "for osd id: %(osdid)s" % params)
    else:
        # NOTE(review): a failed --mkjournal is only reported, the script
        # still ends normally - confirm that callers expect this.
        print("Error initializing journal node: "
              "%(journal_node)s for osd id: %(osdid)s "
              "ceph-osd return code: %(ret)s reason: %(reason)s" % params)
|
|
|
|
|
|
########
|
|
# Main #
|
|
########
|
|
|
|
def main(argv):
    """Parse the command line and run the requested journal operation.

    Args:
        argv: Two items, a mode and a dict literal string:
            "partitions" with keys 'disk_path' and 'journals' (a list), or
            "location" with keys 'data_path', 'journal_path' and 'osdid'
            (an int).

    Exits the process with status -1 when the arguments are not valid.
    """
    # parse and validate arguments
    partitions = None
    location = None
    valid = False
    if len(argv) == 2:
        mode, payload = argv
        if mode == "partitions":
            partitions = get_input(payload, ['disk_path', 'journals'])
            valid = (bool(partitions) and
                     isinstance(partitions['journals'], list))
        elif mode == "location":
            location = get_input(payload,
                                 ['data_path', 'journal_path', 'osdid'])
            valid = bool(location) and isinstance(location['osdid'], int)
    if not valid:
        print("Command intended for internal use only")
        # sys.exit() rather than exit(): the latter is only injected by the
        # site module and is not guaranteed to exist.
        sys.exit(-1)

    if partitions:
        # Recreate partitions only if the existing ones don't match input
        if not is_partitioning_correct(partitions['disk_path'],
                                       partitions['journals']):
            create_partitions(partitions['disk_path'], partitions['journals'])
        else:
            print("Partition table for %s is correct, "
                  "no need to repartition" %
                  device_path_to_device_node(partitions['disk_path']))
    elif location:
        # we need to have the data partition mounted & we can let it mounted
        mount_point = mount_data_partition(location['data_path'],
                                           location['osdid'])
        # Update journal location only if link point to another partition
        if not is_location_correct(mount_point,
                                   location['journal_path'],
                                   location['osdid']):
            print("Fixing journal location for "
                  "OSD id: %(id)s" % {"id": location['osdid']})
            fix_location(mount_point,
                         location['journal_path'],
                         location['osdid'])
        else:
            print("Journal location for %s is correct, "
                  "no need to change it" % location['data_path'])
|
|
|
|
|
|
# Run only when executed as a script, so that importing this module (for
# example to reuse or test its helpers) does not trigger any disk operation.
if __name__ == "__main__":
    main(sys.argv[1:])
|