From 783f16035a8e251d2138eb5bbaa459e9e4486d90 Mon Sep 17 00:00:00 2001 From: Samuel Merritt Date: Fri, 1 Jun 2012 15:24:21 -0700 Subject: [PATCH] Fix starvation in object server with fast clients. When an object server was handling concurrent GET or POST requests from very fast clients, it would starve other connected clients. The greenthreads responsible for servicing the fast clients would hog the processor and only rarely yield to another greenthread. The reason this happens for GET requests is found in eventlet.greenio.GreenSocket, in the send() method. When you call .send(data) on a GreenSocket, it immediately calls .send(data) on its underlying real socket (socket._socketobject). If the real socket accepts all the data, then GreenSocket.send() returns without yielding to another greenthread. Only if the real socket failed to accept all the data (either .send(data) < len(data) or by raising EWOULDBLOCK) does the GreenSocket yield control. Under most workloads, this isn't a problem. The TCP connection to client X can only consume data so quickly, and therefore the greenthread serving client X will frequently encounter a full socket buffer and yield control, so no clients starve. However, when there's a lot of contention for a single object from a large number of fast clients (e.g. on a LAN connected w/10Gb Ethernet), then one winds up in a situation where reading from the disk is slower than writing to the network, and so full socket buffers become rare, and therefore so do context switches. The end result is that many clients time out waiting for data. The situation for PUT requests is analogous; GreenSocket.recv() seldom encounters EWOULDBLOCK, so greenthreads seldom yield. This patch calls eventlet.sleep() to yield control after each chunk, preventing any one greenthread's IO from blocking the hub for very long. This code has the flaw that it will greenthread-switch twice when a send() or recv() does block, but since there isn't a way to find out if a switch occurred or not, there's no way to avoid it. Since greenlet switches are quite fast (faster than system calls, which the object server does a lot of), this shouldn't have a significant performance impact. Change-Id: I8549adfb4a198739b80979236c27b76df607eebf --- swift/obj/server.py | 13 ++++++++++--- test/unit/obj/test_server.py | 18 ++++++++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/swift/obj/server.py b/swift/obj/server.py index 1669385a8e..9a636890ed 100644 --- a/swift/obj/server.py +++ b/swift/obj/server.py @@ -104,12 +104,15 @@ class DiskFile(object): :param container: container name for the object :param obj: object name for the object :param keep_data_fp: if True, don't close the fp, otherwise close it - :param disk_chunk_Size: size of chunks on file reads + :param disk_chunk_size: size of chunks on file reads + :param iter_hook: called when __iter__ returns a chunk """ def __init__(self, path, device, partition, account, container, obj, - logger, keep_data_fp=False, disk_chunk_size=65536): + logger, keep_data_fp=False, disk_chunk_size=65536, + iter_hook=None): self.disk_chunk_size = disk_chunk_size + self.iter_hook = iter_hook self.name = '/' + '/'.join((account, container, obj)) name_hash = hash_path(account, container, obj) self.datadir = os.path.join(path, device, @@ -173,6 +176,8 @@ class DiskFile(object): read - dropped_cache) dropped_cache = read yield chunk + if self.iter_hook: + self.iter_hook() else: self.read_to_eof = True self.drop_cache(self.fp.fileno(), dropped_cache, @@ -590,6 +595,7 @@ class ObjectController(object): tpool.execute(os.fdatasync, fd) drop_buffer_cache(fd, last_sync, upload_size - last_sync) last_sync = upload_size + sleep() if 'content-length' in request.headers and \ int(request.headers['content-length']) != upload_size: @@ -650,7 +656,8 @@ class ObjectController(object): return HTTPInsufficientStorage(drive=device, request=request) file = DiskFile(self.devices, device, partition, account, container, obj, self.logger, keep_data_fp=True, - disk_chunk_size=self.disk_chunk_size) + disk_chunk_size=self.disk_chunk_size, + iter_hook=sleep) if file.is_deleted() or ('X-Delete-At' in file.metadata and int(file.metadata['X-Delete-At']) <= time.time()): if request.headers.get('if-match') == '*': diff --git a/test/unit/obj/test_server.py b/test/unit/obj/test_server.py index c93ba67fcc..dd9d3c8cb4 100644 --- a/test/unit/obj/test_server.py +++ b/test/unit/obj/test_server.py @@ -90,6 +90,18 @@ class TestDiskFile(unittest.TestCase): 'o', FakeLogger()).mkstemp(): self.assert_(os.path.exists(tmpdir)) + def test_iter_hook(self): + hook_call_count = [0] + def hook(): + hook_call_count[0] += 1 + + df = self._get_data_file(fsize=65, csize=8, iter_hook=hook) + print repr(df.__dict__) + for _ in df: + pass + + self.assertEquals(hook_call_count[0], 9) + def test_quarantine(self): df = object_server.DiskFile(self.testdir, 'sda1', '0', 'a', 'c', 'o', FakeLogger()) @@ -136,7 +148,8 @@ class TestDiskFile(unittest.TestCase): self.assert_('-' in os.path.basename(double_uuid_path)) def _get_data_file(self, invalid_type=None, obj_name='o', - fsize=1024, csize=8, extension='.data', ts=None): + fsize=1024, csize=8, extension='.data', ts=None, + iter_hook=None): '''returns a DiskFile''' df = object_server.DiskFile(self.testdir, 'sda1', '0', 'a', 'c', obj_name, FakeLogger()) @@ -168,7 +181,8 @@ class TestDiskFile(unittest.TestCase): df = object_server.DiskFile(self.testdir, 'sda1', '0', 'a', 'c', obj_name, FakeLogger(), - keep_data_fp=True, disk_chunk_size=csize) + keep_data_fp=True, disk_chunk_size=csize, + iter_hook=iter_hook) if invalid_type == 'Zero-Byte': os.remove(df.data_file) fp = open(df.data_file, 'w')