diff --git a/.gitignore b/.gitignore index 32a80896..4219e517 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,6 @@ .testrepository bin *.sw[nop] -.idea *.pyc .unit-state.db +.idea diff --git a/config.yaml b/config.yaml index 6253bcf2..b54e82a0 100644 --- a/config.yaml +++ b/config.yaml @@ -164,7 +164,8 @@ options: sysctl: type: string default: '{ kernel.pid_max : 2097152, vm.max_map_count : 524288, - kernel.threads-max: 2097152 }' + kernel.threads-max: 2097152, vm.vfs_cache_pressure: 1, + vm.swappiness: 1 }' description: | YAML-formatted associative array of sysctl key/value pairs to be set persistently. By default we set pid_max, max_map_count and @@ -177,6 +178,14 @@ options: description: | Setting this to true will tell Ceph to replicate across Juju's Availability Zone instead of specifically by host. + max-sectors-kb: + default: 1048576 + type: int + description: | + This parameter will adjust every block device in your server to allow + greater IO operation sizes. If you have a RAID card with cache on it + consider tuning this much higher than the 1MB default. 1MB is a safe + default for spinning HDDs that don't have much cache. nagios_context: type: string default: "juju" @@ -204,4 +213,13 @@ options: description: | Apply system hardening. Supports a space-delimited list of modules to run. Supported modules currently include os, ssh, apache and mysql. + autotune: + default: False + type: boolean + description: | + Enabling this option will attempt to tune your network card sysctls and + hard drive settings. This changes hard drive read ahead settings and + max_sectors_kb. For the network card this will detect the link speed + and make appropriate sysctl changes. Enabling this option should + generally be safe. 
diff --git a/hooks/ceph.py b/hooks/ceph.py index 22e6c9af..edb4f2e6 100644 --- a/hooks/ceph.py +++ b/hooks/ceph.py @@ -23,6 +23,7 @@ import re import sys import shutil from charmhelpers.cli.host import mounts +from charmhelpers.core import hookenv from charmhelpers.core.host import ( mkdir, chownr, @@ -48,7 +49,7 @@ from charmhelpers.contrib.storage.linux.utils import ( ) from utils import ( get_unit_hostname, -) + render_template) LEADER = 'leader' PEON = 'peon' @@ -56,6 +57,333 @@ QUORUM = [LEADER, PEON] PACKAGES = ['ceph', 'gdisk', 'ntp', 'btrfs-tools', 'python-ceph', 'xfsprogs'] +LinkSpeed = { + "BASE_10": 10, + "BASE_100": 100, + "BASE_1000": 1000, + "GBASE_10": 10000, + "GBASE_40": 40000, + "GBASE_100": 100000, + "UNKNOWN": None +} + +# Mapping of adapter speed to sysctl settings +NETWORK_ADAPTER_SYSCTLS = { + # 10Gb + LinkSpeed["GBASE_10"]: { + 'net.core.rmem_default': 524287, + 'net.core.wmem_default': 524287, + 'net.core.rmem_max': 524287, + 'net.core.wmem_max': 524287, + 'net.core.optmem_max': 524287, + 'net.core.netdev_max_backlog': 300000, + 'net.ipv4.tcp_rmem': '10000000 10000000 10000000', + 'net.ipv4.tcp_wmem': '10000000 10000000 10000000', + 'net.ipv4.tcp_mem': '10000000 10000000 10000000' + }, + # Mellanox 10/40Gb + LinkSpeed["GBASE_40"]: { + 'net.ipv4.tcp_timestamps': 0, + 'net.ipv4.tcp_sack': 1, + 'net.core.netdev_max_backlog': 250000, + 'net.core.rmem_max': 4194304, + 'net.core.wmem_max': 4194304, + 'net.core.rmem_default': 4194304, + 'net.core.wmem_default': 4194304, + 'net.core.optmem_max': 4194304, + 'net.ipv4.tcp_rmem': '4096 87380 4194304', + 'net.ipv4.tcp_wmem': '4096 65536 4194304', + 'net.ipv4.tcp_low_latency': 1, + 'net.ipv4.tcp_adv_win_scale': 1 + } +} + + +def save_sysctls(sysctl_dict, save_location): + """ + Persist the sysctls to the hard drive. + :param sysctl_dict: dict + :param save_location: path to save the settings to + :raise: IOError if anything goes wrong with writing. 
+ """ + try: + # Persist the settings for reboots + with open(save_location, "w") as fd: + for key, value in sysctl_dict.items(): + fd.write("{}={}\n".format(key, value)) + + except IOError as e: + log("Unable to persist sysctl settings to {}. Error {}".format( + save_location, e.message), level=ERROR) + raise + + +def tune_nic(network_interface): + """ + This will set optimal sysctls for the particular network adapter. + :param network_interface: string The network adapter name. + """ + speed = get_link_speed(network_interface) + if speed in NETWORK_ADAPTER_SYSCTLS: + status_set('maintenance', 'Tuning device {}'.format( + network_interface)) + sysctl_file = os.path.join( + os.sep, + 'etc', + 'sysctl.d', + '51-ceph-osd-charm-{}.conf'.format(network_interface)) + try: + log("Saving sysctl_file: {} values: {}".format( + sysctl_file, NETWORK_ADAPTER_SYSCTLS[speed]), + level=DEBUG) + save_sysctls(sysctl_dict=NETWORK_ADAPTER_SYSCTLS[speed], + save_location=sysctl_file) + except IOError as e: + log("Write to /etc/sysctl.d/51-ceph-osd-charm-{} " + "failed. {}".format(network_interface, e.message), + level=ERROR) + + try: + # Apply the settings + log("Applying sysctl settings", level=DEBUG) + subprocess.check_output(["sysctl", "-p", sysctl_file]) + except subprocess.CalledProcessError as err: + log('sysctl -p {} failed with error {}'.format(sysctl_file, + err.output), + level=ERROR) + else: + log("No settings found for network adapter: {}".format( + network_interface), level=DEBUG) + + +def get_link_speed(network_interface): + """ + This will find the link speed for a given network device. Returns None + if an error occurs. + :param network_interface: string The network adapter interface. 
+ :return: LinkSpeed + """ + speed_path = os.path.join(os.sep, 'sys', 'class', 'net', + network_interface, 'speed') + # I'm not sure where else we'd check if this doesn't exist + if not os.path.exists(speed_path): + return LinkSpeed["UNKNOWN"] + + try: + with open(speed_path, 'r') as sysfs: + nic_speed = sysfs.readlines() + + # Did we actually read anything? + if not nic_speed: + return LinkSpeed["UNKNOWN"] + + # Try to find a sysctl match for this particular speed + for name, speed in LinkSpeed.items(): + if speed == int(nic_speed[0].strip()): + return speed + # Default to UNKNOWN if we can't find a match + return LinkSpeed["UNKNOWN"] + except IOError as e: + log("Unable to open {path} because of error: {error}".format( + path=speed_path, + error=e.message), level=ERROR) + return LinkSpeed["UNKNOWN"] + + + def persist_settings(settings_dict): + # Write all settings to /etc/hdparm.conf + """ + This will persist the hard drive settings to the /etc/hdparm.conf file + The settings_dict should be in the form of {"uuid": {"key":"value"}} + :param settings_dict: dict of settings to save + """ + hdparm_path = os.path.join(os.sep, 'etc', 'hdparm.conf') + try: + with open(hdparm_path, 'w') as hdparm: + hdparm.write(render_template('hdparm.conf', settings_dict)) + except IOError as err: + log("Unable to open {path} because of error: {error}".format( + path=hdparm_path, + error=err.message), level=ERROR) + + + def set_max_sectors_kb(dev_name, max_sectors_size): + """ + This function sets the max_sectors_kb size of a given block device. + :param dev_name: Name of the block device to query + :param max_sectors_size: int of the max_sectors_size to save + """ + max_sectors_kb_path = os.path.join(os.sep, 'sys', 'block', dev_name, + 'queue', 'max_sectors_kb') + try: + with open(max_sectors_kb_path, 'w') as f: + f.write(str(max_sectors_size)) + except IOError as e: + log('Failed to write max_sectors_kb to {}. 
Error: {}'.format( + max_sectors_kb_path, e.message), level=ERROR) + + + def get_max_sectors_kb(dev_name): + """ + This function gets the max_sectors_kb size of a given block device. + :param dev_name: Name of the block device to query + :return: int which is either the max_sectors_kb or 0 on error. + """ + max_sectors_kb_path = os.path.join(os.sep, 'sys', 'block', dev_name, + 'queue', 'max_sectors_kb') + + # Read in what Linux has set by default + if os.path.exists(max_sectors_kb_path): + try: + with open(max_sectors_kb_path, 'r') as f: + max_sectors_kb = f.read().strip() + return int(max_sectors_kb) + except IOError as e: + log('Failed to read max_sectors_kb to {}. Error: {}'.format( + max_sectors_kb_path, e.message), level=ERROR) + # Bail. + return 0 + return 0 + + + def get_max_hw_sectors_kb(dev_name): + """ + This function gets the max_hw_sectors_kb for a given block device. + :param dev_name: Name of the block device to query + :return: int which is either the max_hw_sectors_kb or 0 on error. + """ + max_hw_sectors_kb_path = os.path.join(os.sep, 'sys', 'block', dev_name, + 'queue', 'max_hw_sectors_kb') + # Read in what the hardware supports + if os.path.exists(max_hw_sectors_kb_path): + try: + with open(max_hw_sectors_kb_path, 'r') as f: + max_hw_sectors_kb = f.read().strip() + return int(max_hw_sectors_kb) + except IOError as e: + log('Failed to read max_hw_sectors_kb to {}. Error: {}'.format( + max_hw_sectors_kb_path, e.message), level=ERROR) + return 0 + return 0 + + + def set_hdd_read_ahead(dev_name, read_ahead_sectors=256): + """ + This function sets the hard drive read ahead. + :param dev_name: Name of the block device to set read ahead on. + :param read_ahead_sectors: int How many sectors to read ahead. 
+ """ + try: + # Set the read ahead sectors to 256 + log('Setting read ahead to {} for device {}'.format( + read_ahead_sectors, + dev_name)) + subprocess.check_output(['hdparm', + '-a{}'.format(read_ahead_sectors), + dev_name]) + except subprocess.CalledProcessError as e: + log('hdparm failed with error: {}'.format(e.output), + level=ERROR) + + +def get_block_uuid(block_dev): + """ + This queries blkid to get the uuid for a block device. + :param block_dev: Name of the block device to query. + :return: The UUID of the device or None on Error. + """ + try: + block_info = subprocess.check_output( + ['blkid', '-o', 'export', block_dev]) + for tag in block_info.split('\n'): + parts = tag.split('=') + if parts[0] == 'UUID': + return parts[1] + return None + except subprocess.CalledProcessError as err: + log('get_block_uuid failed with error: {}'.format(err.output), + level=ERROR) + return None + + +def check_max_sectors(save_settings_dict, + block_dev, + uuid): + """ + Tune the max_hw_sectors if needed. + make sure that /sys/.../max_sectors_kb matches max_hw_sectors_kb or at + least 1MB for spinning disks + If the box has a RAID card with cache this could go much bigger. 
+ :param save_settings_dict: The dict used to persist settings + :param block_dev: A block device name: Example: /dev/sda + :param uuid: The uuid of the block device + """ + dev_name = None + path_parts = os.path.split(block_dev) + if len(path_parts) == 2: + dev_name = path_parts[1] + else: + log('Unable to determine the block device name from path: {}'.format( + block_dev)) + # Play it safe and bail + return + max_sectors_kb = get_max_sectors_kb(dev_name=dev_name) + max_hw_sectors_kb = get_max_hw_sectors_kb(dev_name=dev_name) + + if max_sectors_kb < max_hw_sectors_kb: + # OK we have a situation where the hardware supports more than Linux is + # currently requesting + config_max_sectors_kb = hookenv.config('max-sectors-kb') + if config_max_sectors_kb < max_hw_sectors_kb: + # Set the max_sectors_kb to the config.yaml value if it is less + # than the max_hw_sectors_kb + log('Setting max_sectors_kb for device {} to {}'.format( + dev_name, config_max_sectors_kb)) + save_settings_dict[ + "drive_settings"][uuid][ + "read_ahead_sect"] = config_max_sectors_kb + set_max_sectors_kb(dev_name=dev_name, + max_sectors_size=config_max_sectors_kb) + else: + # Set to the max_hw_sectors_kb + log('Setting max_sectors_kb for device {} to {}'.format( + dev_name, max_hw_sectors_kb)) + save_settings_dict[ + "drive_settings"][uuid]['read_ahead_sect'] = max_hw_sectors_kb + set_max_sectors_kb(dev_name=dev_name, + max_sectors_size=max_hw_sectors_kb) + else: + log('max_sectors_kb match max_hw_sectors_kb. No change needed for ' + 'device: {}'.format(block_dev)) + + +def tune_dev(block_dev): + """ + Try to make some intelligent decisions with HDD tuning. Future work will + include optimizing SSDs. + This function will change the read ahead sectors and the max write + sectors for each block device. + :param block_dev: A block device name: Example: /dev/sda + """ + uuid = get_block_uuid(block_dev) + if uuid is None: + log('block device {} uuid is None. 
Unable to save to ' + 'hdparm.conf'.format(block_dev), level=DEBUG) + save_settings_dict = {} + log('Tuning device {}'.format(block_dev)) + status_set('maintenance', 'Tuning device {}'.format(block_dev)) + set_hdd_read_ahead(block_dev) + save_settings_dict["drive_settings"] = {} + save_settings_dict["drive_settings"][uuid] = {} + save_settings_dict["drive_settings"][uuid]['read_ahead_sect'] = 256 + + check_max_sectors(block_dev=block_dev, + save_settings_dict=save_settings_dict, + uuid=uuid) + + persist_settings(settings_dict=save_settings_dict) + status_set('maintenance', 'Finished tuning device {}'.format(block_dev)) + def ceph_user(): if get_version() > 1: diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index e7f1bc50..d6240fbb 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import os import random import subprocess @@ -21,6 +20,7 @@ import sys import tempfile import socket import time +import netifaces import ceph from charmhelpers.core import hookenv @@ -270,12 +270,24 @@ def upgrade_osd(): sys.exit(1) +def tune_network_adapters(): + interfaces = netifaces.interfaces() + for interface in interfaces: + if interface == "lo": + # Skip the loopback + continue + log("Looking up {} for possible sysctl tuning.".format(interface)) + ceph.tune_nic(interface) + + @hooks.hook('install.real') @harden() def install(): add_source(config('source'), config('key')) apt_update(fatal=True) apt_install(packages=ceph.PACKAGES, fatal=True) + if config('autotune'): + tune_network_adapters() def az_info(): @@ -440,6 +452,9 @@ def prepare_disks_and_activate(): osd_journal, config('osd-reformat'), config('ignore-device-errors'), config('osd-encrypt')) + # Make it fast! 
+ if config('autotune'): + ceph.tune_dev(dev) ceph.start_osds(get_devices()) diff --git a/templates/hdparm.conf b/templates/hdparm.conf new file mode 100644 index 00000000..f0a4d59b --- /dev/null +++ b/templates/hdparm.conf @@ -0,0 +1,7 @@ +{% for uuid,settings in drive_settings.items() %} + /dev/disk/by-uuid/{{ uuid }} { + {% for key, value in settings.items() %} + {{ key }} = {{ value }} + {% endfor %} + } +{% endfor %} \ No newline at end of file diff --git a/unit_tests/test_tuning.py b/unit_tests/test_tuning.py new file mode 100644 index 00000000..61a69443 --- /dev/null +++ b/unit_tests/test_tuning.py @@ -0,0 +1,125 @@ +__author__ = 'Chris Holcombe ' +from mock import patch, call +import test_utils +import ceph + +TO_PATCH = [ + 'hookenv', + 'status_set', + 'subprocess', + 'log', +] + + +class PerformanceTestCase(test_utils.CharmTestCase): + def setUp(self): + super(PerformanceTestCase, self).setUp(ceph, TO_PATCH) + + def test_tune_nic(self): + with patch('ceph.get_link_speed', return_value=10000): + with patch('ceph.save_sysctls') as save_sysctls: + ceph.tune_nic('eth0') + save_sysctls.assert_has_calls( + [ + call( + save_location='/etc/sysctl.d/' + '51-ceph-osd-charm-eth0.conf', + sysctl_dict={ + 'net.core.rmem_max': 524287, + 'net.core.wmem_max': 524287, + 'net.core.rmem_default': 524287, + 'net.ipv4.tcp_wmem': + '10000000 10000000 10000000', + 'net.core.netdev_max_backlog': 300000, + 'net.core.optmem_max': 524287, + 'net.ipv4.tcp_mem': + '10000000 10000000 10000000', + 'net.ipv4.tcp_rmem': + '10000000 10000000 10000000', + 'net.core.wmem_default': 524287}) + ]) + self.status_set.assert_has_calls( + [ + call('maintenance', 'Tuning device eth0'), + ]) + + def test_get_block_uuid(self): + self.subprocess.check_output.return_value = \ + 'UUID=378f3c86-b21a-4172-832d-e2b3d4bc7511\nTYPE=ext2\n' + uuid = ceph.get_block_uuid('/dev/sda1') + self.assertEqual(uuid, '378f3c86-b21a-4172-832d-e2b3d4bc7511') + + @patch('ceph.persist_settings') + 
@patch('ceph.set_hdd_read_ahead') + @patch('ceph.get_max_sectors_kb') + @patch('ceph.get_max_hw_sectors_kb') + @patch('ceph.set_max_sectors_kb') + @patch('ceph.get_block_uuid') + def test_tune_dev(self, + block_uuid, + set_max_sectors_kb, + get_max_hw_sectors_kb, + get_max_sectors_kb, + set_hdd_read_ahead, + persist_settings): + self.hookenv.config.return_value = 712 + block_uuid.return_value = '378f3c86-b21a-4172-832d-e2b3d4bc7511' + set_hdd_read_ahead.return_value = None + get_max_sectors_kb.return_value = 512 + get_max_hw_sectors_kb.return_value = 1024 + ceph.tune_dev('/dev/sda') + # The config value was lower than the hardware value. + # We use the lower value. The user wants 712 but the hw supports + # 1K + set_max_sectors_kb.assert_called_with( + dev_name='sda', max_sectors_size=712 + ) + persist_settings.assert_called_with( + settings_dict={'drive_settings': { + '378f3c86-b21a-4172-832d-e2b3d4bc7511': { + 'read_ahead_sect': 712}}} + ) + self.status_set.assert_has_calls([ + call('maintenance', 'Tuning device /dev/sda'), + call('maintenance', 'Finished tuning device /dev/sda') + ]) + + @patch('ceph.persist_settings') + @patch('ceph.set_hdd_read_ahead') + @patch('ceph.get_max_sectors_kb') + @patch('ceph.get_max_hw_sectors_kb') + @patch('ceph.set_max_sectors_kb') + @patch('ceph.get_block_uuid') + def test_tune_dev_2(self, + block_uuid, + set_max_sectors_kb, + get_max_hw_sectors_kb, + get_max_sectors_kb, + set_hdd_read_ahead, + persist_settings): + self.hookenv.config.return_value = 2048 + block_uuid.return_value = '378f3c86-b21a-4172-832d-e2b3d4bc7511' + set_hdd_read_ahead.return_value = None + get_max_sectors_kb.return_value = 512 + get_max_hw_sectors_kb.return_value = 1024 + ceph.tune_dev('/dev/sda') + # The config value was higher than the hardware value. + # We use the lower value. 
The user wants 2K but the hw only support 1K + set_max_sectors_kb.assert_called_with( + dev_name='sda', max_sectors_size=1024 + ) + persist_settings.assert_called_with( + settings_dict={'drive_settings': { + '378f3c86-b21a-4172-832d-e2b3d4bc7511': { + 'read_ahead_sect': 1024}}} + ) + self.status_set.assert_has_calls([ + call('maintenance', 'Tuning device /dev/sda'), + call('maintenance', 'Finished tuning device /dev/sda') + ]) + + def test_set_hdd_read_ahead(self): + ceph.set_hdd_read_ahead(dev_name='/dev/sda') + self.subprocess.check_output.assert_called_with( + ['hdparm', '-a256', '/dev/sda'] + )