Added per disk PUT timing monitoring support.

Fixes bug 1104708

Swift can suffer a severe performance drop if a single disk on one
storage node is problematic, due to the poor state of async disk I/O.

This patch provides monitoring of PUT timing per kB transferred (ms/kB)
for each non-zero-byte request on each disk, and reports it to
StatsD for alerting.
- Adds the "object-server.PUT.<device>.timing" metric for object-server.

DocImpact.

Change-Id: Ie94bddad28e8be52e71683bf6c9db988664abe47
This commit is contained in:
Jian Zhang 2013-02-24 00:02:32 -08:00
parent b6b5d6670d
commit 1d8a02f25c
3 changed files with 17 additions and 0 deletions

View File

@ -756,6 +756,9 @@ Metric Name Description
`object-server.PUT.timeouts` Count of object PUTs which exceeded max_upload_time. `object-server.PUT.timeouts` Count of object PUTs which exceeded max_upload_time.
`object-server.PUT.timing` Timing data for each PUT request not resulting in an `object-server.PUT.timing` Timing data for each PUT request not resulting in an
error. error.
`object-server.PUT.<device>.timing` Timing data per kB transferred (ms/kB) for each
non-zero-byte PUT request on each device.
Useful for monitoring problematic devices; higher is worse.
`object-server.GET.errors.timing` Timing data for GET request errors: bad request, `object-server.GET.errors.timing` Timing data for GET request errors: bad request,
not mounted, header timestamps before the epoch, not mounted, header timestamps before the epoch,
precondition failed. precondition failed.

View File

@ -508,6 +508,12 @@ class StatsdClient(object):
return self.timing(metric, (time.time() - orig_time) * 1000, return self.timing(metric, (time.time() - orig_time) * 1000,
sample_rate) sample_rate)
def transfer_rate(self, metric, elasped_time, byte_xfer, sample_rate=None):
    """
    Report the time spent per kB transferred (ms/kB) as a timing stat.

    Nothing is sent, and None is returned, when byte_xfer is zero or
    otherwise falsy; this also avoids a division by zero.

    :param metric: name of the timing metric to report
    :param elasped_time: duration of the transfer in seconds (the spelling
                         is kept so keyword callers keep working)
    :param byte_xfer: number of bytes transferred
    :param sample_rate: optional sample rate, passed through to timing()
    :returns: the result of self.timing(), or None if nothing was sent
    """
    if not byte_xfer:
        return None
    # seconds -> ms, then per byte -> per kB (1 kB = 1000 bytes).  The float
    # literal keeps integer arguments from being truncated by Python 2's
    # integer division.
    ms_per_kb = elasped_time * 1000.0 / byte_xfer * 1000
    return self.timing(metric, ms_per_kb, sample_rate)
def timing_stats(**dec_kwargs): def timing_stats(**dec_kwargs):
""" """
@ -662,6 +668,7 @@ class LogAdapter(logging.LoggerAdapter, object):
decrement = statsd_delegate('decrement') decrement = statsd_delegate('decrement')
timing = statsd_delegate('timing') timing = statsd_delegate('timing')
timing_since = statsd_delegate('timing_since') timing_since = statsd_delegate('timing_since')
transfer_rate = statsd_delegate('transfer_rate')
class SwiftLogFormatter(logging.Formatter): class SwiftLogFormatter(logging.Formatter):

View File

@ -642,6 +642,7 @@ class ObjectController(object):
etag = md5() etag = md5()
upload_size = 0 upload_size = 0
last_sync = 0 last_sync = 0
elasped_time = 0
with file.mkstemp() as fd: with file.mkstemp() as fd:
try: try:
fallocate(fd, int(request.headers.get('content-length', 0))) fallocate(fd, int(request.headers.get('content-length', 0)))
@ -649,6 +650,7 @@ class ObjectController(object):
return HTTPInsufficientStorage(drive=device, request=request) return HTTPInsufficientStorage(drive=device, request=request)
reader = request.environ['wsgi.input'].read reader = request.environ['wsgi.input'].read
for chunk in iter(lambda: reader(self.network_chunk_size), ''): for chunk in iter(lambda: reader(self.network_chunk_size), ''):
start_time = time.time()
upload_size += len(chunk) upload_size += len(chunk)
if time.time() > upload_expiration: if time.time() > upload_expiration:
self.logger.increment('PUT.timeouts') self.logger.increment('PUT.timeouts')
@ -663,6 +665,11 @@ class ObjectController(object):
drop_buffer_cache(fd, last_sync, upload_size - last_sync) drop_buffer_cache(fd, last_sync, upload_size - last_sync)
last_sync = upload_size last_sync = upload_size
sleep() sleep()
elasped_time += time.time() - start_time
if upload_size:
self.logger.transfer_rate(
'PUT.' + device + '.timing', elasped_time, upload_size)
if 'content-length' in request.headers and \ if 'content-length' in request.headers and \
int(request.headers['content-length']) != upload_size: int(request.headers['content-length']) != upload_size: