From 8d3b3b2ee0a86b37b6ad1f4d201bfaa0103af07d Mon Sep 17 00:00:00 2001 From: Samuel Merritt Date: Sun, 11 Jan 2015 13:41:35 -0800 Subject: [PATCH] Add some debug output to the ring builder Sometimes, I get handed a builder file in a support ticket and a question of the form "why is the balance [not] doing $thing?". When that happens, I add a bunch of print statements to my local swift/common/ring/builder.py, figure things out, and then delete the print statements. This time, instead of deleting the print statements, I turned them into debug() calls and added a "--debug" flag to the rebalance command in hopes that someone else will find it useful. Change-Id: I697af90984fa5b314ddf570280b4585ba0ba363c --- doc/source/admin_guide.rst | 10 ++++++++ swift/cli/ringbuilder.py | 25 +++++++++++++++----- swift/common/ring/builder.py | 39 +++++++++++++++++++++++++++++-- test/unit/cli/test_ringbuilder.py | 27 +++++++++++++++++++++ 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index d96353c848..5b7a02850a 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -88,6 +88,16 @@ attempting to write to or read the builder/ring files while operations are in progress. This can be useful in environments where ring management has been automated but the operator still needs to interact with the rings manually. +If the ring builder is not producing the balances that you are +expecting, you can gain visibility into what it's doing with the +``--debug`` flag.:: + + swift-ring-builder rebalance --debug + +This produces a great deal of output that is mostly useful if you are +either (a) attempting to fix the ring builder, or (b) filing a bug +against the ring builder. + ----------------------- Scripting Ring Creation ----------------------- diff --git a/swift/cli/ringbuilder.py b/swift/cli/ringbuilder.py index 0a7dab5331..16315627da 100755 --- a/swift/cli/ringbuilder.py +++ b/swift/cli/ringbuilder.py @@ -14,12 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging + from errno import EEXIST from itertools import islice, izip from operator import itemgetter from os import mkdir from os.path import basename, abspath, dirname, exists, join as pathjoin -from sys import argv as sys_argv, exit, stderr +from sys import argv as sys_argv, exit, stderr, stdout from textwrap import wrap from time import time import optparse @@ -831,6 +833,8 @@ swift-ring-builder rebalance [options] help='Force a rebalanced ring to save even ' 'if < 1% of parts changed') parser.add_option('-s', '--seed', help="seed to use for rebalance") + parser.add_option('-d', '--debug', action='store_true', + help="print debug information") options, args = parser.parse_args(argv) def get_seed(index): @@ -841,6 +845,14 @@ swift-ring-builder rebalance [options] except IndexError: pass + if options.debug: + logger = logging.getLogger("swift.ring.builder") + logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(stdout) + formatter = logging.Formatter("%(levelname)s: %(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) + devs_changed = builder.devs_changed try: last_balance = builder.get_balance() @@ -889,11 +901,12 @@ swift-ring-builder rebalance [options] status = EXIT_SUCCESS if builder.dispersion > 0: print '-' * 79 - print('NOTE: Dispersion of %.06f indicates some parts are not\n' - ' optimally dispersed.\n\n' - ' You may want adjust some device weights, increase\n' - ' the overload or review the dispersion report.' % - builder.dispersion) + print( + 'NOTE: Dispersion of %.06f indicates some parts are not\n' + ' optimally dispersed.\n\n' + ' You may want to adjust some device weights, increase\n' + ' the overload or review the dispersion report.' % + builder.dispersion) status = EXIT_WARNING print '-' * 79 elif balance > 5 and balance / 100.0 > builder.overload: diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py index b464bd76b2..6672fdbecc 100644 --- a/swift/common/ring/builder.py +++ b/swift/common/ring/builder.py @@ -17,6 +17,7 @@ import bisect import copy import errno import itertools +import logging import math import random import cPickle as pickle @@ -33,6 +34,16 @@ from swift.common.ring.utils import tiers_for_dev, build_tier_tree, \ MAX_BALANCE = 999.99 +try: + # python 2.7+ + from logging import NullHandler +except ImportError: + # python 2.6 + class NullHandler(logging.Handler): + def emit(self, *a, **kw): + pass + + class RingBuilder(object): """ Used to build swift.common.ring.RingData instances to be written to disk @@ -96,6 +107,11 @@ class RingBuilder(object): self._remove_devs = [] self._ring = None + self.logger = logging.getLogger("swift.ring.builder") + if not self.logger.handlers: + # silence "no handler for X" error messages + self.logger.addHandler(NullHandler()) + def weight_of_one_part(self): """ Returns the weight of each partition as calculated from the @@ -355,6 +371,7 @@ class RingBuilder(object): self._ring = None if self._last_part_moves_epoch is None: + self.logger.debug("New builder; performing initial balance") self._initial_balance() self.devs_changed = False self._build_dispersion_graph() @@ -363,16 +380,23 @@ class RingBuilder(object): self._update_last_part_moves() last_balance = 0 new_parts, removed_part_count = self._adjust_replica2part2dev_size() + self.logger.debug( + "%d new parts and %d removed parts from replica-count change", + len(new_parts), removed_part_count) changed_parts += removed_part_count self._set_parts_wanted() self._reassign_parts(new_parts) changed_parts += len(new_parts) while True: reassign_parts = self._gather_reassign_parts() - self._reassign_parts(reassign_parts) changed_parts += len(reassign_parts) + self.logger.debug("Gathered %d parts", changed_parts) + self._reassign_parts(reassign_parts) + self.logger.debug("Assigned %d parts", changed_parts) while self._remove_devs: - self.devs[self._remove_devs.pop()['id']] = None + remove_dev_id = self._remove_devs.pop()['id'] + self.logger.debug("Removing dev %d", remove_dev_id) + self.devs[remove_dev_id] = None balance = self.get_balance() if balance < 1 or abs(last_balance - balance) < 1 or \ changed_parts == self.parts: @@ -786,6 +810,9 @@ class RingBuilder(object): if dev_id in dev_ids: self._last_part_moves[part] = 0 removed_dev_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [dev removed]", + part, replica, dev_id) # Now we gather partitions that are "at risk" because they aren't # currently sufficient spread out across the cluster. @@ -859,6 +886,9 @@ class RingBuilder(object): dev['parts'] -= 1 removed_replica = True moved_parts += 1 + self.logger.debug( + "Gathered %d/%d from dev %d [dispersion]", + part, replica, dev['id']) break if removed_replica: for tier in tfd[dev['id']]: @@ -894,6 +924,9 @@ class RingBuilder(object): dev['parts_wanted'] += 1 dev['parts'] -= 1 reassign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [weight]", + part, replica, dev['id']) reassign_parts.update(spread_out_parts) reassign_parts.update(removed_dev_parts) @@ -1121,6 +1154,8 @@ class RingBuilder(object): new_index, new_last_sort_key) self._replica2part2dev[replica][part] = dev['id'] + self.logger.debug( + "Placed %d/%d onto dev %d", part, replica, dev['id']) # Just to save memory and keep from accidental reuse. for dev in self._iter_devs(): diff --git a/test/unit/cli/test_ringbuilder.py b/test/unit/cli/test_ringbuilder.py index 38991bfe4a..8b96f25165 100644 --- a/test/unit/cli/test_ringbuilder.py +++ b/test/unit/cli/test_ringbuilder.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import mock import os import StringIO @@ -1744,6 +1745,32 @@ class TestRebalanceCommand(unittest.TestCase, RunSwiftRingBuilderMixin): raise return (mock_stdout.getvalue(), mock_stderr.getvalue()) + def test_debug(self): + # NB: getLogger(name) always returns the same object + rb_logger = logging.getLogger("swift.ring.builder") + try: + self.assertNotEqual(rb_logger.getEffectiveLevel(), logging.DEBUG) + + self.run_srb("create", 8, 3, 1) + self.run_srb("add", + "r1z1-10.1.1.1:2345/sda", 100.0, + "r1z1-10.1.1.1:2345/sdb", 100.0, + "r1z1-10.1.1.1:2345/sdc", 100.0, + "r1z1-10.1.1.1:2345/sdd", 100.0) + self.run_srb("rebalance", "--debug") + self.assertEqual(rb_logger.getEffectiveLevel(), logging.DEBUG) + + rb_logger.setLevel(logging.INFO) + self.run_srb("rebalance", "--debug", "123") + self.assertEqual(rb_logger.getEffectiveLevel(), logging.DEBUG) + + rb_logger.setLevel(logging.INFO) + self.run_srb("rebalance", "123", "--debug") + self.assertEqual(rb_logger.getEffectiveLevel(), logging.DEBUG) + + finally: + rb_logger.setLevel(logging.INFO) # silence other test cases + def test_rebalance_warning_appears(self): self.run_srb("create", 8, 3, 24) # all in one machine: totally balanceable