From 8801b7409030a1fb36fefcabd38af190bcb38c95 Mon Sep 17 00:00:00 2001 From: Darrell Bishop Date: Sat, 19 Jan 2013 15:25:27 -0800 Subject: [PATCH] Make statsd sample rate behave better. As Dieter pointed out in bug 1090495 (https://bugs.launchpad.net/swift/+bug/1090495), the volume of metrics can vary wildly between StatsD metrics. This patch implements a partial solution by reducing the sample_rate used for known high-volume metrics (operational experience will need to inform this over time) and introducing a new tunable, log_statsd_sample_rate_factor which is multiplied by the sample_rate for every statsd stat. This tunable can be used to reduce StatsD traffic proportionally for all metrics and is intended to replace log_statsd_default_sample_rate, which is left alone for backward-compatibility, should anyone be using it. This patch also includes a drive-by fix for log_udp_port which wasn't being converted to an int (I didn't verify that actually causes trouble in SysLogHandler(), but it's definitely an improvement regardles). Change-Id: Id404636e3629f6431cf1c4e64a143959750a3c23 --- doc/source/admin_guide.rst | 24 +++++- etc/account-server.conf-sample | 3 +- etc/container-server.conf-sample | 3 +- etc/object-expirer.conf-sample | 3 +- etc/object-server.conf-sample | 3 +- etc/proxy-server.conf-sample | 6 +- swift/account/server.py | 12 +-- swift/common/middleware/proxy_logging.py | 1 + swift/common/utils.py | 57 +++++++------ swift/container/server.py | 12 +-- swift/obj/server.py | 12 +-- test/unit/common/test_utils.py | 102 ++++++++++++++++++++++- 12 files changed, 183 insertions(+), 55 deletions(-) diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index b1d00fa364..40fdcc5c7a 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -407,7 +407,8 @@ configuration entries (see the sample configuration files):: log_statsd_host = localhost log_statsd_port = 8125 - log_statsd_default_sample_rate = 1 + log_statsd_default_sample_rate = 1.0 + log_statsd_sample_rate_factor = 1.0 log_statsd_metric_prefix = [empty-string] If `log_statsd_host` is not set, this feature is disabled. The default values @@ -422,9 +423,24 @@ probability of sending a sample for any given event or timing measurement. This sample rate is sent with each sample to StatsD and used to multiply the value. For example, with a sample rate of 0.5, StatsD will multiply that counter's value by 2 when flushing the metric to an upstream -monitoring system (Graphite_, Ganglia_, etc.). To get the best data, start -with the default `log_statsd_default_sample_rate` value of 1 and only lower -it as needed. +monitoring system (Graphite_, Ganglia_, etc.). + +Some relatively high-frequency metrics have a default sample rate less than +one. If you want to override the default sample rate for all metrics whose +default sample rate is not specified in the Swift source, you may set +`log_statsd_default_sample_rate` to a value less than one. This is NOT +recommended (see next paragraph). A better way to reduce StatsD load is to +adjust `log_statsd_sample_rate_factor` to a value less than one. The +`log_statsd_sample_rate_factor` is multiplied to any sample rate (either the +global default or one specified by the actual metric logging call in the Swift +source) prior to handling. In other words, this one tunable can lower the +frequency of all StatsD logging by a proportional amount. + +To get the best data, start with the default `log_statsd_default_sample_rate` +and `log_statsd_sample_rate_factor` values of 1 and only lower +`log_statsd_sample_rate_factor` if needed. The +`log_statsd_default_sample_rate` should not be used and remains for backward +compatibility only. The metric prefix will be prepended to every metric sent to the StatsD server For example, with:: diff --git a/etc/account-server.conf-sample b/etc/account-server.conf-sample index 5cb0d2897e..32c5c56723 100644 --- a/etc/account-server.conf-sample +++ b/etc/account-server.conf-sample @@ -24,7 +24,8 @@ # You can enable StatsD logging here: # log_statsd_host = localhost # log_statsd_port = 8125 -# log_statsd_default_sample_rate = 1 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 # log_statsd_metric_prefix = # If you don't mind the extra disk space usage in overhead, you can turn this # on to preallocate disk space with SQLite databases to decrease fragmentation. diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample index 728fa4cd3c..fdf8301ae5 100644 --- a/etc/container-server.conf-sample +++ b/etc/container-server.conf-sample @@ -27,7 +27,8 @@ # You can enable StatsD logging here: # log_statsd_host = localhost # log_statsd_port = 8125 -# log_statsd_default_sample_rate = 1 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 # log_statsd_metric_prefix = # If you don't mind the extra disk space usage in overhead, you can turn this # on to preallocate disk space with SQLite databases to decrease fragmentation. diff --git a/etc/object-expirer.conf-sample b/etc/object-expirer.conf-sample index cef0f0f19d..f6062586a9 100644 --- a/etc/object-expirer.conf-sample +++ b/etc/object-expirer.conf-sample @@ -16,7 +16,8 @@ # You can enable StatsD logging here: # log_statsd_host = localhost # log_statsd_port = 8125 -# log_statsd_default_sample_rate = 1 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 # log_statsd_metric_prefix = [object-expirer] diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample index 2e8b54f4eb..2341bed898 100644 --- a/etc/object-server.conf-sample +++ b/etc/object-server.conf-sample @@ -25,7 +25,8 @@ # You can enable StatsD logging here: # log_statsd_host = localhost # log_statsd_port = 8125 -# log_statsd_default_sample_rate = 1 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 # log_statsd_metric_prefix = # eventlet_debug = false diff --git a/etc/proxy-server.conf-sample b/etc/proxy-server.conf-sample index 7a9254de37..ed994ed166 100644 --- a/etc/proxy-server.conf-sample +++ b/etc/proxy-server.conf-sample @@ -26,7 +26,8 @@ # You can enable StatsD logging here: # log_statsd_host = localhost # log_statsd_port = 8125 -# log_statsd_default_sample_rate = 1 +# log_statsd_default_sample_rate = 1.0 +# log_statsd_sample_rate_factor = 1.0 # log_statsd_metric_prefix = # Use a comma separated list of full url (http://foo.bar:1234,https://foo.bar) # cors_allow_origin = @@ -328,7 +329,8 @@ use = egg:swift#proxy_logging # You can use log_statsd_* from [DEFAULT] or override them here: # access_log_statsd_host = localhost # access_log_statsd_port = 8125 -# access_log_statsd_default_sample_rate = 1 +# access_log_statsd_default_sample_rate = 1.0 +# access_log_statsd_sample_rate_factor = 1.0 # access_log_statsd_metric_prefix = # access_log_headers = False # What HTTP methods are allowed for StatsD logging (comma-sep); request methods diff --git a/swift/account/server.py b/swift/account/server.py index cd6484eb41..91bb8c2449 100644 --- a/swift/account/server.py +++ b/swift/account/server.py @@ -63,7 +63,7 @@ class AccountController(object): return AccountBroker(db_path, account=account, logger=self.logger) @public - @timing_stats + @timing_stats() def DELETE(self, req): """Handle HTTP DELETE request.""" try: @@ -85,7 +85,7 @@ class AccountController(object): return HTTPNoContent(request=req) @public - @timing_stats + @timing_stats() def PUT(self, req): """Handle HTTP PUT request.""" try: @@ -141,7 +141,7 @@ class AccountController(object): return HTTPAccepted(request=req) @public - @timing_stats + @timing_stats() def HEAD(self, req): """Handle HTTP HEAD request.""" # TODO(refactor): The account server used to provide a 'account and @@ -189,7 +189,7 @@ class AccountController(object): return HTTPNoContent(request=req, headers=headers, charset='utf-8') @public - @timing_stats + @timing_stats() def GET(self, req): """Handle HTTP GET request.""" try: @@ -277,7 +277,7 @@ class AccountController(object): return ret @public - @timing_stats + @timing_stats() def REPLICATE(self, req): """ Handle HTTP REPLICATE request. @@ -301,7 +301,7 @@ class AccountController(object): return ret @public - @timing_stats + @timing_stats() def POST(self, req): """Handle HTTP POST request.""" try: diff --git a/swift/common/middleware/proxy_logging.py b/swift/common/middleware/proxy_logging.py index 4e3494b186..2f17a6efce 100644 --- a/swift/common/middleware/proxy_logging.py +++ b/swift/common/middleware/proxy_logging.py @@ -68,6 +68,7 @@ class ProxyLoggingMiddleware(object): for key in ('log_facility', 'log_name', 'log_level', 'log_udp_host', 'log_udp_port', 'log_statsd_host', 'log_statsd_port', 'log_statsd_default_sample_rate', + 'log_statsd_sample_rate_factor', 'log_statsd_metric_prefix'): value = conf.get('access_' + key, conf.get(key, None)) if value: diff --git a/swift/common/utils.py b/swift/common/utils.py index 233c3c96af..62740d57c4 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -411,12 +411,13 @@ class LoggerFileObject(object): class StatsdClient(object): def __init__(self, host, port, base_prefix='', tail_prefix='', - default_sample_rate=1): + default_sample_rate=1, sample_rate_factor=1): self._host = host self._port = port self._base_prefix = base_prefix self.set_prefix(tail_prefix) self._default_sample_rate = default_sample_rate + self._sample_rate_factor = sample_rate_factor self._target = (self._host, self._port) self.random = random @@ -433,6 +434,7 @@ class StatsdClient(object): def _send(self, m_name, m_value, m_type, sample_rate): if sample_rate is None: sample_rate = self._default_sample_rate + sample_rate = sample_rate * self._sample_rate_factor parts = ['%s%s:%s' % (self._prefix, m_name, m_value), m_type] if sample_rate < 1: if self.random() < sample_rate: @@ -464,25 +466,30 @@ class StatsdClient(object): sample_rate) -def timing_stats(func): +def timing_stats(**dec_kwargs): """ - Decorator that logs timing events or errors for public methods in swift's - wsgi server controllers, based on response code. + Returns a decorator that logs timing events or errors for public methods in + swift's wsgi server controllers, based on response code. """ - method = func.func_name + def decorating_func(func): + method = func.func_name - @functools.wraps(func) - def _timing_stats(ctrl, *args, **kwargs): - start_time = time.time() - resp = func(ctrl, *args, **kwargs) - if is_success(resp.status_int) or is_redirection(resp.status_int) or \ - resp.status_int == HTTP_NOT_FOUND: - ctrl.logger.timing_since(method + '.timing', start_time) - else: - ctrl.logger.timing_since(method + '.errors.timing', start_time) - return resp + @functools.wraps(func) + def _timing_stats(ctrl, *args, **kwargs): + start_time = time.time() + resp = func(ctrl, *args, **kwargs) + if is_success(resp.status_int) or \ + is_redirection(resp.status_int) or \ + resp.status_int == HTTP_NOT_FOUND: + ctrl.logger.timing_since(method + '.timing', + start_time, **dec_kwargs) + else: + ctrl.logger.timing_since(method + '.errors.timing', + start_time, **dec_kwargs) + return resp - return _timing_stats + return _timing_stats + return decorating_func # double inheritance to support property with setter @@ -590,14 +597,11 @@ class LogAdapter(logging.LoggerAdapter, object): def statsd_delegate(statsd_func_name): """ - Factory which creates methods which delegate to methods on + Factory to create methods which delegate to methods on self.logger.statsd_client (an instance of StatsdClient). The created methods conditionally delegate to a method whose name is given in 'statsd_func_name'. The created delegate methods are a no-op when - StatsD logging is not configured. The created delegate methods also - handle the defaulting of sample_rate (to either the default specified - in the config with 'log_statsd_default_sample_rate' or the value passed - into delegate function). + StatsD logging is not configured. :param statsd_func_name: the name of a method on StatsdClient. """ @@ -655,7 +659,8 @@ def get_logger(conf, name=None, log_to_console=False, log_route=None, log_address = /dev/log log_statsd_host = (disabled) log_statsd_port = 8125 - log_statsd_default_sample_rate = 1 + log_statsd_default_sample_rate = 1.0 + log_statsd_sample_rate_factor = 1.0 log_statsd_metric_prefix = (empty-string) :param conf: Configuration dict to read settings from @@ -687,7 +692,8 @@ def get_logger(conf, name=None, log_to_console=False, log_route=None, SysLogHandler.LOG_LOCAL0) udp_host = conf.get('log_udp_host') if udp_host: - udp_port = conf.get('log_udp_port', logging.handlers.SYSLOG_UDP_PORT) + udp_port = int(conf.get('log_udp_port', + logging.handlers.SYSLOG_UDP_PORT)) handler = SysLogHandler(address=(udp_host, udp_port), facility=facility) else: @@ -727,8 +733,11 @@ def get_logger(conf, name=None, log_to_console=False, log_route=None, base_prefix = conf.get('log_statsd_metric_prefix', '') default_sample_rate = float(conf.get( 'log_statsd_default_sample_rate', 1)) + sample_rate_factor = float(conf.get( + 'log_statsd_sample_rate_factor', 1)) statsd_client = StatsdClient(statsd_host, statsd_port, base_prefix, - name, default_sample_rate) + name, default_sample_rate, + sample_rate_factor) logger.statsd_client = statsd_client else: logger.statsd_client = None diff --git a/swift/container/server.py b/swift/container/server.py index 9e699bf500..6b0cf38280 100644 --- a/swift/container/server.py +++ b/swift/container/server.py @@ -141,7 +141,7 @@ class ContainerController(object): return None @public - @timing_stats + @timing_stats() def DELETE(self, req): """Handle HTTP DELETE request.""" try: @@ -184,7 +184,7 @@ class ContainerController(object): return HTTPNotFound() @public - @timing_stats + @timing_stats() def PUT(self, req): """Handle HTTP PUT request.""" try: @@ -248,7 +248,7 @@ class ContainerController(object): return HTTPAccepted(request=req) @public - @timing_stats + @timing_stats(sample_rate=0.1) def HEAD(self, req): """Handle HTTP HEAD request.""" try: @@ -287,7 +287,7 @@ class ContainerController(object): return HTTPNoContent(request=req, headers=headers, charset='utf-8') @public - @timing_stats + @timing_stats() def GET(self, req): """Handle HTTP GET request.""" try: @@ -395,7 +395,7 @@ class ContainerController(object): return ret @public - @timing_stats + @timing_stats(sample_rate=0.01) def REPLICATE(self, req): """ Handle HTTP REPLICATE request (json-encoded RPC calls for replication.) @@ -418,7 +418,7 @@ class ContainerController(object): return ret @public - @timing_stats + @timing_stats() def POST(self, req): """Handle HTTP POST request.""" try: diff --git a/swift/obj/server.py b/swift/obj/server.py index 2bfe5ec064..8778e737d9 100755 --- a/swift/obj/server.py +++ b/swift/obj/server.py @@ -534,7 +534,7 @@ class ObjectController(object): host, partition, contdevice, headers_out, objdevice) @public - @timing_stats + @timing_stats() def POST(self, request): """Handle HTTP POST requests for the Swift Object Server.""" try: @@ -583,7 +583,7 @@ class ObjectController(object): return HTTPAccepted(request=request) @public - @timing_stats + @timing_stats() def PUT(self, request): """Handle HTTP PUT requests for the Swift Object Server.""" try: @@ -683,7 +683,7 @@ class ObjectController(object): return resp @public - @timing_stats + @timing_stats() def GET(self, request): """Handle HTTP GET requests for the Swift Object Server.""" try: @@ -763,7 +763,7 @@ class ObjectController(object): return request.get_response(response) @public - @timing_stats + @timing_stats(sample_rate=0.8) def HEAD(self, request): """Handle HTTP HEAD requests for the Swift Object Server.""" try: @@ -803,7 +803,7 @@ class ObjectController(object): return response @public - @timing_stats + @timing_stats() def DELETE(self, request): """Handle HTTP DELETE requests for the Swift Object Server.""" try: @@ -851,7 +851,7 @@ class ObjectController(object): return resp @public - @timing_stats + @timing_stats(sample_rate=0.1) def REPLICATE(self, request): """ Handle REPLICATE requests for the Swift Object Server. This is used diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index 25a9b45226..eea3350caf 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -34,6 +34,7 @@ from shutil import rmtree from StringIO import StringIO from functools import partial from tempfile import TemporaryFile, NamedTemporaryFile +from logging import handlers as logging_handlers from eventlet import sleep @@ -377,6 +378,62 @@ class TestUtils(unittest.TestCase): self.assertEquals(sio.getvalue(), 'test1\ntest3\ntest4\ntest6\n') + def test_get_logger_sysloghandler_plumbing(self): + orig_sysloghandler = utils.SysLogHandler + syslog_handler_args = [] + def syslog_handler_catcher(*args, **kwargs): + syslog_handler_args.append((args, kwargs)) + return orig_sysloghandler(*args, **kwargs) + syslog_handler_catcher.LOG_LOCAL0 = orig_sysloghandler.LOG_LOCAL0 + syslog_handler_catcher.LOG_LOCAL3 = orig_sysloghandler.LOG_LOCAL3 + + try: + utils.SysLogHandler = syslog_handler_catcher + logger = utils.get_logger({ + 'log_facility': 'LOG_LOCAL3', + }, 'server', log_route='server') + self.assertEquals([ + ((), {'address': '/dev/log', + 'facility': orig_sysloghandler.LOG_LOCAL3})], + syslog_handler_args) + + syslog_handler_args = [] + logger = utils.get_logger({ + 'log_facility': 'LOG_LOCAL3', + 'log_address': '/foo/bar', + }, 'server', log_route='server') + self.assertEquals([ + ((), {'address': '/foo/bar', + 'facility': orig_sysloghandler.LOG_LOCAL3}), + # Second call is because /foo/bar didn't exist (and wasn't a + # UNIX domain socket). + ((), {'facility': orig_sysloghandler.LOG_LOCAL3})], + syslog_handler_args) + + # Using UDP with default port + syslog_handler_args = [] + logger = utils.get_logger({ + 'log_udp_host': 'syslog.funtimes.com', + }, 'server', log_route='server') + self.assertEquals([ + ((), {'address': ('syslog.funtimes.com', + logging.handlers.SYSLOG_UDP_PORT), + 'facility': orig_sysloghandler.LOG_LOCAL0})], + syslog_handler_args) + + # Using UDP with non-default port + syslog_handler_args = [] + logger = utils.get_logger({ + 'log_udp_host': 'syslog.funtimes.com', + 'log_udp_port': '2123', + }, 'server', log_route='server') + self.assertEquals([ + ((), {'address': ('syslog.funtimes.com', 2123), + 'facility': orig_sysloghandler.LOG_LOCAL0})], + syslog_handler_args) + finally: + utils.SysLogHandler = orig_sysloghandler + def test_clean_logger_exception(self): # setup stream logging sio = StringIO() @@ -960,8 +1017,9 @@ class TestStatsdLogging(unittest.TestCase): def test_get_logger_statsd_client_non_defaults(self): logger = utils.get_logger({ 'log_statsd_host': 'another.host.com', - 'log_statsd_port': 9876, - 'log_statsd_default_sample_rate': 0.75, + 'log_statsd_port': '9876', + 'log_statsd_default_sample_rate': '0.75', + 'log_statsd_sample_rate_factor': '0.81', 'log_statsd_metric_prefix': 'tomato.sauce', }, 'some-name', log_route='some-route') self.assertEqual(logger.logger.statsd_client._prefix, @@ -975,6 +1033,8 @@ class TestStatsdLogging(unittest.TestCase): self.assertEqual(logger.logger.statsd_client._port, 9876) self.assertEqual(logger.logger.statsd_client._default_sample_rate, 0.75) + self.assertEqual(logger.logger.statsd_client._sample_rate_factor, + 0.81) def test_sample_rates(self): logger = utils.get_logger({'log_statsd_host': 'some.host.com'}) @@ -997,6 +1057,42 @@ class TestStatsdLogging(unittest.TestCase): payload = mock_socket.sent[0][0] self.assertTrue(payload.endswith("|@0.5")) + def test_sample_rates_with_sample_rate_factor(self): + logger = utils.get_logger({ + 'log_statsd_host': 'some.host.com', + 'log_statsd_default_sample_rate': '0.82', + 'log_statsd_sample_rate_factor': '0.91', + }) + effective_sample_rate = 0.82 * 0.91 + + mock_socket = MockUdpSocket() + # encapsulation? what's that? + statsd_client = logger.logger.statsd_client + self.assertTrue(statsd_client.random is random.random) + + statsd_client._open_socket = lambda *_: mock_socket + statsd_client.random = lambda: effective_sample_rate + 0.001 + + logger.increment('tribbles') + self.assertEqual(len(mock_socket.sent), 0) + + statsd_client.random = lambda: effective_sample_rate - 0.001 + logger.increment('tribbles') + self.assertEqual(len(mock_socket.sent), 1) + + payload = mock_socket.sent[0][0] + self.assertTrue(payload.endswith("|@%s" % effective_sample_rate), + payload) + + effective_sample_rate = 0.587 * 0.91 + statsd_client.random = lambda: effective_sample_rate - 0.001 + logger.increment('tribbles', sample_rate=0.587) + self.assertEqual(len(mock_socket.sent), 2) + + payload = mock_socket.sent[1][0] + self.assertTrue(payload.endswith("|@%s" % effective_sample_rate), + payload) + def test_timing_stats(self): class MockController(object): def __init__(self, status): @@ -1009,7 +1105,7 @@ class TestStatsdLogging(unittest.TestCase): self.called = 'timing' self.args = args - @utils.timing_stats + @utils.timing_stats() def METHOD(controller): return Response(status=controller.status)