proxy: emit metric when updating namespaces memcache set fails
Previously, if a memcache set command had an unexpected response then an
error was logged but no exception was raised from MemcacheRing.set.

With this patch, when MemcacheRing.set fails in this way, a
MemcacheConnectionError will be raised if 'raise_on_error' is True. This
exception is now handled when the proxy tries to set updating namespaces
in memcache, and a counter metric is emitted with the name
'object.shard_updating.cache.set_error'. If the set is successful, an
'object.shard_updating.cache.set' counter metric is emitted.

The logging of MemcacheRing errors is improved to include the memcache
key being set, in line with exception logging.

Change-Id: Icfc97751c80f4bb4a3373796c01bff6ed5843937
parent 195f54ecb5
commit 965908d753
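
For orientation, a minimal caller-side sketch of the behaviour this change
introduces; the server address, keys, and logger here are illustrative, not
taken from the patch:

    import logging

    from swift.common.memcached import MemcacheRing, MemcacheConnectionError

    memcache = MemcacheRing(['127.0.0.1:11211'],
                            logger=logging.getLogger(__name__))

    # Default behaviour is unchanged: a failed set is logged and swallowed.
    memcache.set('key', 'value')

    # With raise_on_error=True the caller sees the failure and can emit its
    # own metrics, as the proxy now does for updating namespaces.
    try:
        memcache.set('key', 'value', raise_on_error=True)
        # success: proxy increments 'object.shard_updating.cache.set'
    except MemcacheConnectionError:
        # failure: proxy increments 'object.shard_updating.cache.set_error'
        pass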
@@ -245,6 +245,13 @@ class MemcacheRing(object):
     def memcache_servers(self):
         return list(self._client_cache.keys())
 
+    def _log_error(self, server, cmd, action, msg):
+        self.logger.error(
+            "Error %(action)s to memcached: %(server)s"
+            ": with key_prefix %(key_prefix)s, method %(method)s: %(msg)s",
+            {'action': action, 'server': server, 'key_prefix': cmd.key_prefix,
+             'method': cmd.method, 'msg': msg})
+
         """
         Handles exceptions.
 
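The new _log_error helper gives connection-level errors the same shape as the
existing exception logging. A standalone sketch of the message it formats,
using stand-in values for the cmd fields (the real cmd carries the key_prefix
and method of the current request):

    # e.g. action='connecting', server='ALL' when every server is
    # error-limited, as the tests later in this change assert
    msg = ("Error %(action)s to memcached: %(server)s"
           ": with key_prefix %(key_prefix)s, method %(method)s: %(msg)s" % {
               'action': 'connecting', 'server': 'ALL',
               'key_prefix': 'some_key', 'method': 'set',
               'msg': 'All memcached servers error-limited'})
    # -> "Error connecting to memcached: ALL: with key_prefix some_key,
    #     method set: All memcached servers error-limited"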
@@ -367,7 +374,8 @@ class MemcacheRing(object):
                 self._exception_occurred(server, e, cmd, pool_start_time,
                                          action='connecting', sock=sock)
         if not any_yielded:
-            self.logger.error('All memcached servers error-limited')
+            self._log_error('ALL', cmd, 'connecting',
+                            'All memcached servers error-limited')
 
     def _return_conn(self, server, fp, sock):
         """Returns a server connection to the pool."""
@@ -404,6 +412,14 @@ class MemcacheRing(object):
         elif not isinstance(value, bytes):
             value = str(value).encode('utf-8')
 
+        if 0 <= self.item_size_warning_threshold <= len(value):
+            self.logger.warning(
+                "Item size larger than warning threshold: "
+                "%d (%s) >= %d (%s)", len(value),
+                human_readable(len(value)),
+                self.item_size_warning_threshold,
+                human_readable(self.item_size_warning_threshold))
+
         for (server, fp, sock) in self._get_conns(cmd):
             conn_start_time = tm.time()
             try:
@@ -414,17 +430,7 @@ class MemcacheRing(object):
                     if msg != b'STORED':
                         if not six.PY2:
                             msg = msg.decode('ascii')
-                        self.logger.error(
-                            "Error setting value in memcached: "
-                            "%(server)s: %(msg)s",
-                            {'server': server, 'msg': msg})
-                    if 0 <= self.item_size_warning_threshold <= len(value):
-                        self.logger.warning(
-                            "Item size larger than warning threshold: "
-                            "%d (%s) >= %d (%s)", len(value),
-                            human_readable(len(value)),
-                            self.item_size_warning_threshold,
-                            human_readable(self.item_size_warning_threshold))
+                        raise MemcacheConnectionError('failed set: %s' % msg)
                     self._return_conn(server, fp, sock)
                     return
             except (Exception, Timeout) as e:
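Two things happen in the set() hunks above: the item-size warning moves ahead
of the connection loop so it fires once per call rather than once per
connection attempt, and an unexpected response now raises inside the try
block, putting failed sets on the same path as socket errors (logged with
timing detail and counted towards error-limiting). A runnable sketch of that
control flow, with do_set and handle_exception as hypothetical stand-ins for
the real socket I/O and _exception_occurred:

    class MemcacheConnectionError(Exception):
        # stand-in for swift.common.memcached.MemcacheConnectionError
        pass

    def do_set(server):
        return b'SERVER_ERROR object too large for cache'  # simulated failure

    def handle_exception(server, e):
        print('Error talking to memcached: %s: %s' % (server, e))

    def set_value(servers, raise_on_error=False):
        for server in servers:
            try:
                msg = do_set(server)
                if msg != b'STORED':
                    # raising here reuses the generic handler below
                    raise MemcacheConnectionError(
                        'failed set: %s' % msg.decode('ascii'))
                return
            except MemcacheConnectionError as e:
                handle_exception(server, e)
        if raise_on_error:
            raise MemcacheConnectionError('No memcached connections succeeded.')

    set_value(['1.2.3.4:11211'])  # logs the error but returns, as before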
@@ -345,6 +345,25 @@ class BaseObjectController(Controller):
         # there will be only one shard range in the list if any
         return shard_ranges[0] if shard_ranges else None
 
+    def _cache_update_namespaces(self, memcache, cache_key, namespaces):
+        if not memcache:
+            return
+
+        self.logger.info(
+            'Caching updating shards for %s (%d shards)',
+            cache_key, len(namespaces.bounds))
+        try:
+            memcache.set(
+                cache_key, namespaces.bounds,
+                time=self.app.recheck_updating_shard_ranges,
+                raise_on_error=True)
+            cache_state = 'set'
+        except MemcacheConnectionError:
+            cache_state = 'set_error'
+        finally:
+            record_cache_op_metrics(self.logger, self.server_type.lower(),
+                                    'shard_updating', cache_state, None)
+
     def _get_update_shard(self, req, account, container, obj):
         """
         Find the appropriate shard range for an object update.
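With server_type 'object' and op_type 'shard_updating', the
record_cache_op_metrics call in the finally block above yields exactly the
two counters named in the commit message:

    object.shard_updating.cache.set        # memcache.set succeeded
    object.shard_updating.cache.set_error  # MemcacheConnectionError raised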
@@ -386,16 +405,9 @@ class BaseObjectController(Controller):
             if shard_ranges:
                 # only store the list of namespace lower bounds and names into
                 # infocache and memcache.
-                cached_namespaces = NamespaceBoundList.parse(
-                    shard_ranges)
-                infocache[cache_key] = cached_namespaces
-                if memcache:
-                    self.logger.info(
-                        'Caching updating shards for %s (%d shards)',
-                        cache_key, len(cached_namespaces.bounds))
-                    memcache.set(
-                        cache_key, cached_namespaces.bounds,
-                        time=self.app.recheck_updating_shard_ranges)
+                namespaces = NamespaceBoundList.parse(shard_ranges)
+                infocache[cache_key] = namespaces
+                self._cache_update_namespaces(memcache, cache_key, namespaces)
             update_shard = find_namespace(obj, shard_ranges or [])
         record_cache_op_metrics(
             self.logger, self.server_type.lower(), 'shard_updating',
@@ -410,16 +410,50 @@ class TestMemcached(unittest.TestCase):
         self.assertAlmostEqual(float(cache_timeout), esttimeout, delta=1)
 
     def test_set_error(self):
-        memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
-                                                 logger=self.logger)
+        memcache_client = memcached.MemcacheRing(
+            ['1.2.3.4:11211'], logger=self.logger,
+            item_size_warning_threshold=1)
         mock = MockMemcached()
         memcache_client._client_cache['1.2.3.4:11211'] = MockedMemcachePool(
             [(mock, mock)] * 2)
-        memcache_client.set('too-big', [1, 2, 3])
+        now = time.time()
+        with patch('time.time', return_value=now):
+            memcache_client.set('too-big', [1, 2, 3])
         self.assertEqual(
             self.logger.get_lines_for_level('error'),
-            ['Error setting value in memcached: 1.2.3.4:11211: '
-             'SERVER_ERROR object too large for cache'])
+            ['Error talking to memcached: 1.2.3.4:11211: '
+             'with key_prefix too-big, method set, time_spent 0.0, '
+             'failed set: SERVER_ERROR object too large for cache'])
+        warning_lines = self.logger.get_lines_for_level('warning')
+        self.assertEqual(1, len(warning_lines))
+        self.assertIn('Item size larger than warning threshold',
+                      warning_lines[0])
+        self.assertTrue(mock.close_called)
+
+    def test_set_error_raise_on_error(self):
+        memcache_client = memcached.MemcacheRing(
+            ['1.2.3.4:11211'], logger=self.logger,
+            item_size_warning_threshold=1)
+        mock = MockMemcached()
+        memcache_client._client_cache[
+            '1.2.3.4:11211'] = MockedMemcachePool(
+            [(mock, mock)] * 2)
+        now = time.time()
+
+        with self.assertRaises(MemcacheConnectionError) as cm:
+            with patch('time.time', return_value=now):
+                memcache_client.set('too-big', [1, 2, 3], raise_on_error=True)
+        self.assertIn("No memcached connections succeeded", str(cm.exception))
+        self.assertEqual(
+            self.logger.get_lines_for_level('error'),
+            ['Error talking to memcached: 1.2.3.4:11211: '
+             'with key_prefix too-big, method set, time_spent 0.0, '
+             'failed set: SERVER_ERROR object too large for cache'])
+        warning_lines = self.logger.get_lines_for_level('warning')
+        self.assertEqual(1, len(warning_lines))
+        self.assertIn('Item size larger than warning threshold',
+                      warning_lines[0])
+        self.assertTrue(mock.close_called)
 
     def test_get_failed_connection_mid_request(self):
         memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
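Note that both tests build the ring with item_size_warning_threshold=1 so the
tiny test value is guaranteed to trip the size warning, confirming it now
fires exactly once per set() call after moving out of the per-connection loop.
The raise_on_error variant additionally asserts the terminal 'No memcached
connections succeeded' message, which set() raises only once every server has
failed and been logged.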
@@ -620,7 +654,8 @@ class TestMemcached(unittest.TestCase):
             'with key_prefix some_key, method set, time_spent 0.0, '
             '[Errno 32] Broken pipe',
             'Error limiting server 1.2.3.4:11211',
-            'All memcached servers error-limited',
+            'Error connecting to memcached: ALL: with key_prefix some_key, '
+            'method set: All memcached servers error-limited',
         ])
         self.logger.clear()
 
@@ -628,14 +663,16 @@ class TestMemcached(unittest.TestCase):
         for _ in range(12):
             memcache_client.set('some_key', [1, 2, 3])
         self.assertEqual(self.logger.get_lines_for_level('error'), [
-            'All memcached servers error-limited',
+            'Error connecting to memcached: ALL: with key_prefix some_key, '
+            'method set: All memcached servers error-limited',
         ] * 12)
         self.logger.clear()
 
         # and get()s are all a "cache miss"
         self.assertIsNone(memcache_client.get('some_key'))
         self.assertEqual(self.logger.get_lines_for_level('error'), [
-            'All memcached servers error-limited',
+            'Error connecting to memcached: ALL: with key_prefix some_key, '
+            'method get: All memcached servers error-limited',
         ])
 
     def test_error_disabled(self):
@@ -752,7 +789,8 @@ class TestMemcached(unittest.TestCase):
             'with key_prefix some_key, method set, time_spent 0.0, '
             '[Errno 32] Broken pipe',
             'Error limiting server 1.2.3.5:11211',
-            'All memcached servers error-limited',
+            'Error connecting to memcached: ALL: with key_prefix some_key, '
+            'method set: All memcached servers error-limited',
         ])
 
         # with default error_limit_time of 60, one call per 6 secs, error limit
@@ -776,8 +814,8 @@ class TestMemcached(unittest.TestCase):
             'with key_prefix some_key, method set, time_spent 0.0, '
             '[Errno 32] Broken pipe',
             'Error limiting server 1.2.3.5:11211',
-            'All memcached servers error-limited',
-        ])
+            'Error connecting to memcached: ALL: with key_prefix some_key, '
+            'method set: All memcached servers error-limited'])
 
         # with error_limit_time of 70, one call per 6 secs, error_limit_count
         # of 11, 13th call triggers error limit
@@ -791,8 +829,8 @@ class TestMemcached(unittest.TestCase):
             'with key_prefix some_key, method set, time_spent 0.0, '
             '[Errno 32] Broken pipe',
             'Error limiting server 1.2.3.5:11211',
-            'All memcached servers error-limited',
-        ])
+            'Error connecting to memcached: ALL: with key_prefix some_key, '
+            'method set: All memcached servers error-limited'])
 
     def test_delete(self):
         memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
@@ -4431,7 +4431,8 @@ class TestReplicatedObjectController(
                           'account.info.infocache.hit': 2,
                           'container.info.cache.miss.200': 1,
                           'container.info.infocache.hit': 1,
-                          'object.shard_updating.cache.miss.200': 1},
+                          'object.shard_updating.cache.miss.200': 1,
+                          'object.shard_updating.cache.set': 1},
                          stats)
         self.assertEqual([], self.app.logger.log_dict['set_statsd_prefix'])
         info_lines = self.logger.get_lines_for_level('info')
@@ -4795,7 +4796,8 @@ class TestReplicatedObjectController(
                           'object.shard_updating.cache.hit': 1,
                           'container.info.cache.hit': 1,
                           'account.info.cache.hit': 1,
-                          'object.shard_updating.cache.skip.200': 1},
+                          'object.shard_updating.cache.skip.200': 1,
+                          'object.shard_updating.cache.set': 1},
                          stats)
         # verify statsd prefix is not mutated
         self.assertEqual([], self.app.logger.log_dict['set_statsd_prefix'])
@@ -4865,7 +4867,99 @@ class TestReplicatedObjectController(
                              'container.info.infocache.hit': 3,
                              'object.shard_updating.cache.skip.200': 1,
                              'object.shard_updating.cache.hit': 1,
-                             'object.shard_updating.cache.error.200': 1})
+                             'object.shard_updating.cache.error.200': 1,
+                             'object.shard_updating.cache.set': 2
+                             })
+
+        do_test('POST', 'sharding')
+        do_test('POST', 'sharded')
+        do_test('DELETE', 'sharding')
+        do_test('DELETE', 'sharded')
+        do_test('PUT', 'sharding')
+        do_test('PUT', 'sharded')
+
+    @patch_policies([
+        StoragePolicy(0, 'zero', is_default=True, object_ring=FakeRing()),
+        StoragePolicy(1, 'one', object_ring=FakeRing()),
+    ])
+    def test_backend_headers_update_shard_container_cache_set_error(self):
+        # verify that backend container update is directed to the shard
+        # container despite memcache set error
+        # reset the router post patch_policies
+        self.app.obj_controller_router = proxy_server.ObjectControllerRouter()
+        self.app.sort_nodes = lambda nodes, *args, **kwargs: nodes
+        self.app.recheck_updating_shard_ranges = 3600
+        self.app.container_updating_shard_ranges_skip_cache = 0.001
+
+        def do_test(method, sharding_state):
+            self.app.logger.clear()  # clean capture state
+            # simulate memcache error when setting updating namespaces;
+            # expect 4 memcache sets: account info, container info, container
+            # info again from namespaces GET subrequest, namespaces
+            cache = FakeMemcache(error_on_set=[False, False, False, True])
+            req = Request.blank(
+                '/v1/a/c/o', {'swift.cache': cache},
+                method=method, body='', headers={'Content-Type': 'text/plain'})
+            # acct HEAD, cont HEAD, cont shard GET, obj POSTs
+            status_codes = (200, 200, 200, 202, 202, 202)
+            resp_headers = {'X-Backend-Storage-Policy-Index': 1,
+                            'x-backend-sharding-state': sharding_state,
+                            'X-Backend-Record-Type': 'shard'}
+            shard_ranges = [
+                utils.ShardRange(
+                    '.shards_a/c_not_used', utils.Timestamp.now(), '', 'l'),
+                utils.ShardRange(
+                    '.shards_a/c_shard', utils.Timestamp.now(), 'l', 'u'),
+                utils.ShardRange(
+                    '.shards_a/c_nope', utils.Timestamp.now(), 'u', ''),
+            ]
+            body = json.dumps([
+                dict(shard_range)
+                for shard_range in shard_ranges]).encode('ascii')
+            with mock.patch('random.random', return_value=0), \
+                    mocked_http_conn(*status_codes, headers=resp_headers,
+                                     body=body) as fake_conn:
+                resp = req.get_response(self.app)
+
+            self.assertEqual(resp.status_int, 202)
+
+            stats = self.app.logger.statsd_client.get_increment_counts()
+            self.assertEqual({'account.info.cache.miss.200': 1,
+                              'account.info.infocache.hit': 2,
+                              'container.info.cache.miss.200': 1,
+                              'container.info.infocache.hit': 1,
+                              'object.shard_updating.cache.skip.200': 1,
+                              'object.shard_updating.cache.set_error': 1},
+                             stats)
+            # verify statsd prefix is not mutated
+            self.assertEqual([], self.app.logger.log_dict['set_statsd_prefix'])
+            # sanity check: namespaces not in cache
+            cache_key = 'shard-updating-v2/a/c'
+            self.assertNotIn(cache_key, req.environ['swift.cache'].store)
+
+            # make sure backend requests included expected container headers
+            container_headers = {}
+            for backend_request in fake_conn.requests[3:]:
+                req_headers = backend_request['headers']
+                device = req_headers['x-container-device']
+                container_headers[device] = req_headers['x-container-host']
+                expectations = {
+                    'method': method,
+                    'path': '/0/a/c/o',
+                    'headers': {
+                        'X-Container-Partition': '0',
+                        'Host': 'localhost:80',
+                        'Referer': '%s http://localhost/v1/a/c/o' % method,
+                        'X-Backend-Storage-Policy-Index': '1',
+                        'X-Backend-Quoted-Container-Path': shard_ranges[1].name
+                    },
+                }
+                self._check_request(backend_request, **expectations)
+
+            expected = {}
+            for i, device in enumerate(['sda', 'sdb', 'sdc']):
+                expected[device] = '10.0.0.%d:100%d' % (i, i)
+            self.assertEqual(container_headers, expected)
 
         do_test('POST', 'sharding')
         do_test('POST', 'sharded')