Make error limits survive a ring reload

The proxy was storing the error count and last-error time in the
ring's internal data, specifically in the device dictionaries. This
works okay, but it means that whenever a ring changes, all the error
stats reset.

Now the error stats live in the proxy server object, so they survive a
ring reload.

Better yet, the error stats are now keyed off of the node's
IP/port/device triple, so if you have the same device in two rings
(like with multiple storage policies), then the error stats are
combined. If the proxy server sees a 507 for an objec request in
policy X, then that will now result in that particular object disk
being error-limited for requests in policies Y and Z as well.

Change-Id: Icc72b68b99f37367bb16d43688e7e45327e3e022
This commit is contained in:
Samuel Merritt 2014-11-13 16:40:05 -08:00
parent 75cca49334
commit e429cd81be
3 changed files with 151 additions and 75 deletions

View File

@ -76,6 +76,8 @@ class Application(object):
else: else:
self.logger = logger self.logger = logger
self._error_limiting = {}
swift_dir = conf.get('swift_dir', '/etc/swift') swift_dir = conf.get('swift_dir', '/etc/swift')
self.swift_dir = swift_dir self.swift_dir = swift_dir
self.node_timeout = int(conf.get('node_timeout', 10)) self.node_timeout = int(conf.get('node_timeout', 10))
@ -406,6 +408,9 @@ class Application(object):
timing = round(timing, 3) # sort timings to the millisecond timing = round(timing, 3) # sort timings to the millisecond
self.node_timings[node['ip']] = (timing, now + self.timing_expiry) self.node_timings[node['ip']] = (timing, now + self.timing_expiry)
def _error_limit_node_key(self, node):
return "{ip}:{port}/{device}".format(**node)
def error_limited(self, node): def error_limited(self, node):
""" """
Check if the node is currently error limited. Check if the node is currently error limited.
@ -414,15 +419,16 @@ class Application(object):
:returns: True if error limited, False otherwise :returns: True if error limited, False otherwise
""" """
now = time() now = time()
if 'errors' not in node: node_key = self._error_limit_node_key(node)
error_stats = self._error_limiting.get(node_key)
if error_stats is None or 'errors' not in error_stats:
return False return False
if 'last_error' in node and node['last_error'] < \ if 'last_error' in error_stats and error_stats['last_error'] < \
now - self.error_suppression_interval: now - self.error_suppression_interval:
del node['last_error'] self._error_limiting.pop(node_key, None)
if 'errors' in node:
del node['errors']
return False return False
limited = node['errors'] > self.error_suppression_limit limited = error_stats['errors'] > self.error_suppression_limit
if limited: if limited:
self.logger.debug( self.logger.debug(
_('Node error limited %(ip)s:%(port)s (%(device)s)'), node) _('Node error limited %(ip)s:%(port)s (%(device)s)'), node)
@ -438,8 +444,10 @@ class Application(object):
:param node: dictionary of node to error limit :param node: dictionary of node to error limit
:param msg: error message :param msg: error message
""" """
node['errors'] = self.error_suppression_limit + 1 node_key = self._error_limit_node_key(node)
node['last_error'] = time() error_stats = self._error_limiting.setdefault(node_key, {})
error_stats['errors'] = self.error_suppression_limit + 1
error_stats['last_error'] = time()
self.logger.error(_('%(msg)s %(ip)s:%(port)s/%(device)s'), self.logger.error(_('%(msg)s %(ip)s:%(port)s/%(device)s'),
{'msg': msg, 'ip': node['ip'], {'msg': msg, 'ip': node['ip'],
'port': node['port'], 'device': node['device']}) 'port': node['port'], 'device': node['device']})
@ -451,8 +459,10 @@ class Application(object):
:param node: dictionary of node to handle errors for :param node: dictionary of node to handle errors for
:param msg: error message :param msg: error message
""" """
node['errors'] = node.get('errors', 0) + 1 node_key = self._error_limit_node_key(node)
node['last_error'] = time() error_stats = self._error_limiting.setdefault(node_key, {})
error_stats['errors'] = error_stats.get('errors', 0) + 1
error_stats['last_error'] = time()
self.logger.error(_('%(msg)s %(ip)s:%(port)s/%(device)s'), self.logger.error(_('%(msg)s %(ip)s:%(port)s/%(device)s'),
{'msg': msg, 'ip': node['ip'], {'msg': msg, 'ip': node['ip'],
'port': node['port'], 'device': node['device']}) 'port': node['port'], 'device': node['device']})

View File

@ -124,7 +124,8 @@ class PatchPolicies(object):
class FakeRing(Ring): class FakeRing(Ring):
def __init__(self, replicas=3, max_more_nodes=0, part_power=0): def __init__(self, replicas=3, max_more_nodes=0, part_power=0,
base_port=1000):
""" """
:param part_power: make part calculation based on the path :param part_power: make part calculation based on the path
@ -132,27 +133,23 @@ class FakeRing(Ring):
out of ring methods will actually be based on the path - otherwise we out of ring methods will actually be based on the path - otherwise we
exercise the real ring code, but ignore the result and return 1. exercise the real ring code, but ignore the result and return 1.
""" """
self._base_port = base_port
self.max_more_nodes = max_more_nodes
self._part_shift = 32 - part_power
# 9 total nodes (6 more past the initial 3) is the cap, no matter if # 9 total nodes (6 more past the initial 3) is the cap, no matter if
# this is set higher, or R^2 for R replicas # this is set higher, or R^2 for R replicas
self.set_replicas(replicas) self.set_replicas(replicas)
self.max_more_nodes = max_more_nodes
self._part_shift = 32 - part_power
self._reload() self._reload()
def _reload(self): def _reload(self):
self._rtime = time.time() self._rtime = time.time()
def clear_errors(self):
for dev in self.devs:
for key in ('errors', 'last_error'):
dev.pop(key, None)
def set_replicas(self, replicas): def set_replicas(self, replicas):
self.replicas = replicas self.replicas = replicas
self._devs = [] self._devs = []
for x in range(self.replicas): for x in range(self.replicas):
ip = '10.0.0.%s' % x ip = '10.0.0.%s' % x
port = 1000 + x port = self._base_port + x
self._devs.append({ self._devs.append({
'ip': ip, 'ip': ip,
'replication_ip': ip, 'replication_ip': ip,
@ -176,7 +173,7 @@ class FakeRing(Ring):
for x in xrange(self.replicas, min(self.replicas + self.max_more_nodes, for x in xrange(self.replicas, min(self.replicas + self.max_more_nodes,
self.replicas * self.replicas)): self.replicas * self.replicas)):
yield {'ip': '10.0.0.%s' % x, yield {'ip': '10.0.0.%s' % x,
'port': 1000 + x, 'port': self._base_port + x,
'device': 'sda', 'device': 'sda',
'zone': x % 3, 'zone': x % 3,
'region': x % 2, 'region': x % 2,

View File

@ -280,6 +280,28 @@ def sortHeaderNames(headerNames):
return ', '.join(headers) return ', '.join(headers)
def node_error_count(proxy_app, ring_node):
# Reach into the proxy's internals to get the error count for a
# particular node
node_key = proxy_app._error_limit_node_key(ring_node)
return proxy_app._error_limiting.get(node_key, {}).get('errors', 0)
def node_last_error(proxy_app, ring_node):
# Reach into the proxy's internals to get the last error for a
# particular node
node_key = proxy_app._error_limit_node_key(ring_node)
return proxy_app._error_limiting.get(node_key, {}).get('last_error')
def set_node_errors(proxy_app, ring_node, value, last_error):
# Set the node's error count to value
node_key = proxy_app._error_limit_node_key(ring_node)
stats = proxy_app._error_limiting.setdefault(node_key, {})
stats['errors'] = value
stats['last_error'] = last_error
class FakeMemcacheReturnsNone(FakeMemcache): class FakeMemcacheReturnsNone(FakeMemcache):
def get(self, key): def get(self, key):
@ -923,20 +945,22 @@ class TestProxyServerLoading(unittest.TestCase):
self.assert_(policy.object_ring) self.assert_(policy.object_ring)
@patch_policies([StoragePolicy(0, 'zero', True, object_ring=FakeRing())]) @patch_policies([StoragePolicy(0, 'zero', True,
object_ring=FakeRing(base_port=3000))])
class TestObjectController(unittest.TestCase): class TestObjectController(unittest.TestCase):
def setUp(self): def setUp(self):
self.app = proxy_server.Application(None, FakeMemcache(), self.app = proxy_server.Application(
logger=debug_logger('proxy-ut'), None, FakeMemcache(),
account_ring=FakeRing(), logger=debug_logger('proxy-ut'),
container_ring=FakeRing()) account_ring=FakeRing(),
container_ring=FakeRing())
def tearDown(self): def tearDown(self):
self.app.account_ring.set_replicas(3) self.app.account_ring.set_replicas(3)
self.app.container_ring.set_replicas(3) self.app.container_ring.set_replicas(3)
for policy in POLICIES: for policy in POLICIES:
policy.object_ring = FakeRing() policy.object_ring = FakeRing(base_port=3000)
def assert_status_map(self, method, statuses, expected, raise_exc=False): def assert_status_map(self, method, statuses, expected, raise_exc=False):
with save_globals(): with save_globals():
@ -2493,9 +2517,9 @@ class TestObjectController(unittest.TestCase):
self.app.log_handoffs = True self.app.log_handoffs = True
self.app.logger = FakeLogger() self.app.logger = FakeLogger()
self.app.request_node_count = lambda r: 7 self.app.request_node_count = lambda r: 7
object_ring.clear_errors() self.app._error_limiting = {} # clear out errors
object_ring._devs[0]['errors'] = 999 set_node_errors(self.app, object_ring._devs[0], 999,
object_ring._devs[0]['last_error'] = 2 ** 63 - 1 last_error=(2 ** 63 - 1))
collected_nodes = [] collected_nodes = []
for node in self.app.iter_nodes(object_ring, partition): for node in self.app.iter_nodes(object_ring, partition):
@ -2510,10 +2534,10 @@ class TestObjectController(unittest.TestCase):
self.app.log_handoffs = True self.app.log_handoffs = True
self.app.logger = FakeLogger() self.app.logger = FakeLogger()
self.app.request_node_count = lambda r: 7 self.app.request_node_count = lambda r: 7
object_ring.clear_errors() self.app._error_limiting = {} # clear out errors
for i in range(2): for i in range(2):
object_ring._devs[i]['errors'] = 999 set_node_errors(self.app, object_ring._devs[i], 999,
object_ring._devs[i]['last_error'] = 2 ** 63 - 1 last_error=(2 ** 63 - 1))
collected_nodes = [] collected_nodes = []
for node in self.app.iter_nodes(object_ring, partition): for node in self.app.iter_nodes(object_ring, partition):
@ -2532,10 +2556,10 @@ class TestObjectController(unittest.TestCase):
self.app.logger = FakeLogger() self.app.logger = FakeLogger()
self.app.request_node_count = lambda r: 10 self.app.request_node_count = lambda r: 10
object_ring.set_replicas(4) # otherwise we run out of handoffs object_ring.set_replicas(4) # otherwise we run out of handoffs
object_ring.clear_errors() self.app._error_limiting = {} # clear out errors
for i in range(4): for i in range(4):
object_ring._devs[i]['errors'] = 999 set_node_errors(self.app, object_ring._devs[i], 999,
object_ring._devs[i]['last_error'] = 2 ** 63 - 1 last_error=(2 ** 63 - 1))
collected_nodes = [] collected_nodes = []
for node in self.app.iter_nodes(object_ring, partition): for node in self.app.iter_nodes(object_ring, partition):
@ -2593,7 +2617,8 @@ class TestObjectController(unittest.TestCase):
def test_iter_nodes_with_custom_node_iter(self): def test_iter_nodes_with_custom_node_iter(self):
object_ring = self.app.get_object_ring(None) object_ring = self.app.get_object_ring(None)
node_list = [dict(id=n) for n in xrange(10)] node_list = [dict(id=n, ip='1.2.3.4', port=n, device='D')
for n in xrange(10)]
with nested( with nested(
mock.patch.object(self.app, 'sort_nodes', lambda n: n), mock.patch.object(self.app, 'sort_nodes', lambda n: n),
mock.patch.object(self.app, 'request_node_count', mock.patch.object(self.app, 'request_node_count',
@ -2674,16 +2699,20 @@ class TestObjectController(unittest.TestCase):
object_ring = controller.app.get_object_ring(None) object_ring = controller.app.get_object_ring(None)
self.assert_status_map(controller.HEAD, (200, 200, 503, 200, 200), self.assert_status_map(controller.HEAD, (200, 200, 503, 200, 200),
200) 200)
self.assertEquals(object_ring.devs[0]['errors'], 2) self.assertEquals(
self.assert_('last_error' in object_ring.devs[0]) node_error_count(controller.app, object_ring.devs[0]), 2)
self.assert_(node_last_error(controller.app, object_ring.devs[0])
is not None)
for _junk in xrange(self.app.error_suppression_limit): for _junk in xrange(self.app.error_suppression_limit):
self.assert_status_map(controller.HEAD, (200, 200, 503, 503, self.assert_status_map(controller.HEAD, (200, 200, 503, 503,
503), 503) 503), 503)
self.assertEquals(object_ring.devs[0]['errors'], self.assertEquals(
self.app.error_suppression_limit + 1) node_error_count(controller.app, object_ring.devs[0]),
self.app.error_suppression_limit + 1)
self.assert_status_map(controller.HEAD, (200, 200, 200, 200, 200), self.assert_status_map(controller.HEAD, (200, 200, 200, 200, 200),
503) 503)
self.assert_('last_error' in object_ring.devs[0]) self.assert_(node_last_error(controller.app, object_ring.devs[0])
is not None)
self.assert_status_map(controller.PUT, (200, 200, 200, 201, 201, self.assert_status_map(controller.PUT, (200, 200, 200, 201, 201,
201), 503) 201), 503)
self.assert_status_map(controller.POST, self.assert_status_map(controller.POST,
@ -2699,6 +2728,34 @@ class TestObjectController(unittest.TestCase):
(200, 200, 200, 204, 204, 204), 503, (200, 200, 200, 204, 204, 204), 503,
raise_exc=True) raise_exc=True)
def test_error_limiting_survives_ring_reload(self):
with save_globals():
controller = proxy_server.ObjectController(self.app, 'account',
'container', 'object')
controller.app.sort_nodes = lambda l: l
object_ring = controller.app.get_object_ring(None)
self.assert_status_map(controller.HEAD, (200, 200, 503, 200, 200),
200)
self.assertEquals(
node_error_count(controller.app, object_ring.devs[0]), 2)
self.assert_(node_last_error(controller.app, object_ring.devs[0])
is not None)
for _junk in xrange(self.app.error_suppression_limit):
self.assert_status_map(controller.HEAD, (200, 200, 503, 503,
503), 503)
self.assertEquals(
node_error_count(controller.app, object_ring.devs[0]),
self.app.error_suppression_limit + 1)
# wipe out any state in the ring
for policy in POLICIES:
policy.object_ring = FakeRing(base_port=3000)
# and we still get an error, which proves that the
# error-limiting info survived a ring reload
self.assert_status_map(controller.HEAD, (200, 200, 200, 200, 200),
503)
def test_PUT_error_limiting(self): def test_PUT_error_limiting(self):
with save_globals(): with save_globals():
controller = proxy_server.ObjectController(self.app, 'account', controller = proxy_server.ObjectController(self.app, 'account',
@ -2710,12 +2767,13 @@ class TestObjectController(unittest.TestCase):
200) 200)
# 2, not 1, because assert_status_map() calls the method twice # 2, not 1, because assert_status_map() calls the method twice
self.assertEquals(object_ring.devs[0].get('errors', 0), 2) odevs = object_ring.devs
self.assertEquals(object_ring.devs[1].get('errors', 0), 0) self.assertEquals(node_error_count(controller.app, odevs[0]), 2)
self.assertEquals(object_ring.devs[2].get('errors', 0), 0) self.assertEquals(node_error_count(controller.app, odevs[1]), 0)
self.assert_('last_error' in object_ring.devs[0]) self.assertEquals(node_error_count(controller.app, odevs[2]), 0)
self.assert_('last_error' not in object_ring.devs[1]) self.assert_(node_last_error(controller.app, odevs[0]) is not None)
self.assert_('last_error' not in object_ring.devs[2]) self.assert_(node_last_error(controller.app, odevs[1]) is None)
self.assert_(node_last_error(controller.app, odevs[2]) is None)
def test_PUT_error_limiting_last_node(self): def test_PUT_error_limiting_last_node(self):
with save_globals(): with save_globals():
@ -2728,18 +2786,18 @@ class TestObjectController(unittest.TestCase):
200) 200)
# 2, not 1, because assert_status_map() calls the method twice # 2, not 1, because assert_status_map() calls the method twice
self.assertEquals(object_ring.devs[0].get('errors', 0), 0) odevs = object_ring.devs
self.assertEquals(object_ring.devs[1].get('errors', 0), 0) self.assertEquals(node_error_count(controller.app, odevs[0]), 0)
self.assertEquals(object_ring.devs[2].get('errors', 0), 2) self.assertEquals(node_error_count(controller.app, odevs[1]), 0)
self.assert_('last_error' not in object_ring.devs[0]) self.assertEquals(node_error_count(controller.app, odevs[2]), 2)
self.assert_('last_error' not in object_ring.devs[1]) self.assert_(node_last_error(controller.app, odevs[0]) is None)
self.assert_('last_error' in object_ring.devs[2]) self.assert_(node_last_error(controller.app, odevs[1]) is None)
self.assert_(node_last_error(controller.app, odevs[2]) is not None)
def test_acc_or_con_missing_returns_404(self): def test_acc_or_con_missing_returns_404(self):
with save_globals(): with save_globals():
self.app.memcache = FakeMemcacheReturnsNone() self.app.memcache = FakeMemcacheReturnsNone()
self.app.account_ring.clear_errors() self.app._error_limiting = {}
self.app.container_ring.clear_errors()
controller = proxy_server.ObjectController(self.app, 'account', controller = proxy_server.ObjectController(self.app, 'account',
'container', 'object') 'container', 'object')
set_http_connect(200, 200, 200, 200, 200, 200) set_http_connect(200, 200, 200, 200, 200, 200)
@ -2806,8 +2864,9 @@ class TestObjectController(unittest.TestCase):
self.assertEquals(resp.status_int, 404) self.assertEquals(resp.status_int, 404)
for dev in self.app.account_ring.devs: for dev in self.app.account_ring.devs:
dev['errors'] = self.app.error_suppression_limit + 1 set_node_errors(
dev['last_error'] = time.time() self.app, dev, self.app.error_suppression_limit + 1,
time.time())
set_http_connect(200) set_http_connect(200)
# acct [isn't actually called since everything # acct [isn't actually called since everything
# is error limited] # is error limited]
@ -2818,10 +2877,11 @@ class TestObjectController(unittest.TestCase):
self.assertEquals(resp.status_int, 404) self.assertEquals(resp.status_int, 404)
for dev in self.app.account_ring.devs: for dev in self.app.account_ring.devs:
dev['errors'] = 0 set_node_errors(self.app, dev, 0, last_error=None)
for dev in self.app.container_ring.devs: for dev in self.app.container_ring.devs:
dev['errors'] = self.app.error_suppression_limit + 1 set_node_errors(self.app, dev,
dev['last_error'] = time.time() self.app.error_suppression_limit + 1,
time.time())
set_http_connect(200, 200) set_http_connect(200, 200)
# acct cont [isn't actually called since # acct cont [isn't actually called since
# everything is error limited] # everything is error limited]
@ -5170,18 +5230,19 @@ class TestObjectController(unittest.TestCase):
@patch_policies([ @patch_policies([
StoragePolicy(0, 'zero', True, object_ring=FakeRing()), StoragePolicy(0, 'zero', True, object_ring=FakeRing(base_port=3000)),
StoragePolicy(1, 'one', False, object_ring=FakeRing()), StoragePolicy(1, 'one', False, object_ring=FakeRing(base_port=3000)),
StoragePolicy(2, 'two', False, True, object_ring=FakeRing()) StoragePolicy(2, 'two', False, True, object_ring=FakeRing(base_port=3000))
]) ])
class TestContainerController(unittest.TestCase): class TestContainerController(unittest.TestCase):
"Test swift.proxy_server.ContainerController" "Test swift.proxy_server.ContainerController"
def setUp(self): def setUp(self):
self.app = proxy_server.Application(None, FakeMemcache(), self.app = proxy_server.Application(
account_ring=FakeRing(), None, FakeMemcache(),
container_ring=FakeRing(), account_ring=FakeRing(),
logger=debug_logger()) container_ring=FakeRing(base_port=2000),
logger=debug_logger())
def test_convert_policy_to_index(self): def test_convert_policy_to_index(self):
controller = swift.proxy.controllers.ContainerController(self.app, controller = swift.proxy.controllers.ContainerController(self.app,
@ -5598,7 +5659,7 @@ class TestContainerController(unittest.TestCase):
for meth in ('DELETE', 'PUT'): for meth in ('DELETE', 'PUT'):
with save_globals(): with save_globals():
self.app.memcache = FakeMemcacheReturnsNone() self.app.memcache = FakeMemcacheReturnsNone()
self.app.account_ring.clear_errors() self.app._error_limiting = {}
controller = proxy_server.ContainerController(self.app, controller = proxy_server.ContainerController(self.app,
'account', 'account',
'container') 'container')
@ -5636,8 +5697,9 @@ class TestContainerController(unittest.TestCase):
self.assertEquals(resp.status_int, 404) self.assertEquals(resp.status_int, 404)
for dev in self.app.account_ring.devs: for dev in self.app.account_ring.devs:
dev['errors'] = self.app.error_suppression_limit + 1 set_node_errors(self.app, dev,
dev['last_error'] = time.time() self.app.error_suppression_limit + 1,
time.time())
set_http_connect(200, 200, 200, 200, 200, 200) set_http_connect(200, 200, 200, 200, 200, 200)
# Make sure it is a blank request wthout env caching # Make sure it is a blank request wthout env caching
req = Request.blank('/v1/a/c', req = Request.blank('/v1/a/c',
@ -5675,19 +5737,26 @@ class TestContainerController(unittest.TestCase):
with save_globals(): with save_globals():
controller = proxy_server.ContainerController(self.app, 'account', controller = proxy_server.ContainerController(self.app, 'account',
'container') 'container')
container_ring = controller.app.container_ring
controller.app.sort_nodes = lambda l: l controller.app.sort_nodes = lambda l: l
self.assert_status_map(controller.HEAD, (200, 503, 200, 200), 200, self.assert_status_map(controller.HEAD, (200, 503, 200, 200), 200,
missing_container=False) missing_container=False)
self.assertEquals( self.assertEquals(
controller.app.container_ring.devs[0]['errors'], 2) node_error_count(controller.app, container_ring.devs[0]), 2)
self.assert_('last_error' in controller.app.container_ring.devs[0]) self.assert_(
node_last_error(controller.app, container_ring.devs[0])
is not None)
for _junk in xrange(self.app.error_suppression_limit): for _junk in xrange(self.app.error_suppression_limit):
self.assert_status_map(controller.HEAD, self.assert_status_map(controller.HEAD,
(200, 503, 503, 503), 503) (200, 503, 503, 503), 503)
self.assertEquals(controller.app.container_ring.devs[0]['errors'], self.assertEquals(
self.app.error_suppression_limit + 1) node_error_count(controller.app, container_ring.devs[0]),
self.app.error_suppression_limit + 1)
self.assert_status_map(controller.HEAD, (200, 200, 200, 200), 503) self.assert_status_map(controller.HEAD, (200, 200, 200, 200), 503)
self.assert_('last_error' in controller.app.container_ring.devs[0]) self.assert_(
node_last_error(controller.app, container_ring.devs[0])
is not None)
self.assert_status_map(controller.PUT, (200, 201, 201, 201), 503, self.assert_status_map(controller.PUT, (200, 201, 201, 201), 503,
missing_container=True) missing_container=True)
self.assert_status_map(controller.DELETE, self.assert_status_map(controller.DELETE,