From 1d8a02f25c8e439e7ce1c45638e9401ea2e8be26 Mon Sep 17 00:00:00 2001 From: Jian Zhang Date: Sun, 24 Feb 2013 00:02:32 -0800 Subject: [PATCH] Added per disk PUT timing monitoring support. Fixes bug 1104708 There could be a severe performance drop for swift if one disk of one storage node is problematic, due to the tragic state of async disk I/O. This patch provides PUT timing per kB transferred (ms/kB) monitoring support for each non-zero-byte request of each disk and reports it to statsD for alerting. - adding "object-server.PUT.<device>.timing" metrics for object-server. DocImpact. Change-Id: Ie94bddad28e8be52e71683bf6c9db988664abe47 --- doc/source/admin_guide.rst | 3 +++ swift/common/utils.py | 7 +++++++ swift/obj/server.py | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index 30d4ce5328..cdcd48b865 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -756,6 +756,9 @@ Metric Name Description `object-server.PUT.timeouts` Count of object PUTs which exceeded max_upload_time. `object-server.PUT.timing` Timing data for each PUT request not resulting in an error. +`object-server.PUT.<device>.timing` Timing data per kB transferred (ms/kB) for each + non-zero-byte PUT request on each device. + Monitoring problematic devices, higher is bad. `object-server.GET.errors.timing` Timing data for GET request errors: bad request, not mounted, header timestamps before the epoch, precondition failed. 
diff --git a/swift/common/utils.py b/swift/common/utils.py index a4d70552bb..d4d8026adb 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -508,6 +508,12 @@ class StatsdClient(object): return self.timing(metric, (time.time() - orig_time) * 1000, sample_rate) + def transfer_rate(self, metric, elasped_time, byte_xfer, sample_rate=None): + if byte_xfer: + return self.timing(metric, + elasped_time * 1000 / byte_xfer * 1000, + sample_rate) + def timing_stats(**dec_kwargs): """ @@ -662,6 +668,7 @@ class LogAdapter(logging.LoggerAdapter, object): decrement = statsd_delegate('decrement') timing = statsd_delegate('timing') timing_since = statsd_delegate('timing_since') + transfer_rate = statsd_delegate('transfer_rate') class SwiftLogFormatter(logging.Formatter): diff --git a/swift/obj/server.py b/swift/obj/server.py index 19e7cbdfdb..16b2beb54b 100644 --- a/swift/obj/server.py +++ b/swift/obj/server.py @@ -642,6 +642,7 @@ class ObjectController(object): etag = md5() upload_size = 0 last_sync = 0 + elasped_time = 0 with file.mkstemp() as fd: try: fallocate(fd, int(request.headers.get('content-length', 0))) @@ -649,6 +650,7 @@ class ObjectController(object): return HTTPInsufficientStorage(drive=device, request=request) reader = request.environ['wsgi.input'].read for chunk in iter(lambda: reader(self.network_chunk_size), ''): + start_time = time.time() upload_size += len(chunk) if time.time() > upload_expiration: self.logger.increment('PUT.timeouts') @@ -663,6 +665,11 @@ class ObjectController(object): drop_buffer_cache(fd, last_sync, upload_size - last_sync) last_sync = upload_size sleep() + elasped_time += time.time() - start_time + + if upload_size: + self.logger.transfer_rate( + 'PUT.' + device + '.timing', elasped_time, upload_size) if 'content-length' in request.headers and \ int(request.headers['content-length']) != upload_size: