From cadc000f32ac2c89f5fd67f876ef2503b2f84892 Mon Sep 17 00:00:00 2001
From: Dantali0n
Date: Thu, 11 Jul 2019 22:07:20 +0200
Subject: [PATCH] Add call_retry for ModelBuilder for error recovery

Add call_retry method for ModelBuilder classes along with configuration
options. This allows ModelBuilder classes to reattempt any failed calls
to external services such as Nova or Ironic.

Change-Id: Ided697adebed957e5ff13b4c6b5b06c816f81c4a
---
 .../api-call-retry-fef741ac684c58dd.yaml      |  9 +++++
 watcher/conf/collector.py                     |  7 ++++
 .../decision_engine/model/collector/base.py   | 38 +++++++++++++++++++
 .../decision_engine/model/collector/cinder.py | 11 +++---
 .../decision_engine/model/collector/ironic.py |  2 +-
 .../decision_engine/model/collector/nova.py   | 18 ++++-----
 6 files changed, 70 insertions(+), 15 deletions(-)
 create mode 100644 releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml

diff --git a/releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml b/releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml
new file mode 100644
index 000000000..7d5d4d774
--- /dev/null
+++ b/releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    API calls made while building the Compute data model will be retried
+    upon failure. The number of failed attempts allowed before giving up
+    and the time to wait before reattempting are configurable through the
+    `api_call_retries` and `api_query_timeout` parameters in the
+    `[collector]` group. The default is 10 retries with 1 second between
+    reattempts.
diff --git a/watcher/conf/collector.py b/watcher/conf/collector.py
index a585f6af9..126657f02 100644
--- a/watcher/conf/collector.py
+++ b/watcher/conf/collector.py
@@ -35,6 +35,13 @@ Supported in-tree collectors include:
 Custom data model collector plugins can be defined with the
 ``watcher_cluster_data_model_collectors`` extension point.
 """),
+    cfg.IntOpt('api_call_retries',
+               default=10,
+               help="Number of retries before giving up on external service "
+                    "calls."),
+    cfg.IntOpt('api_query_timeout',
+               default=1,
+               help="Time in seconds before retrying a failed service call.")
 ]
 
 
diff --git a/watcher/decision_engine/model/collector/base.py b/watcher/decision_engine/model/collector/base.py
index 9016054e7..76e9307e3 100644
--- a/watcher/decision_engine/model/collector/base.py
+++ b/watcher/decision_engine/model/collector/base.py
@@ -106,6 +106,7 @@ strategies.
 import abc
 import copy
 import threading
+import time
 
 from oslo_config import cfg
 from oslo_log import log
@@ -116,6 +117,7 @@ from watcher.common.loader import loadable
 from watcher.decision_engine.model import model_root
 
 LOG = log.getLogger(__name__)
+CONF = cfg.CONF
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -194,6 +196,42 @@ class BaseClusterDataModelCollector(loadable.LoadableSingleton):
 
 class BaseModelBuilder(object):
 
+    def call_retry(self, f, *args, **kwargs):
+        """Attempt to call an external service, retrying on failure.
+
+        Failed calls are retried up to api_call_retries times, waiting
+        api_query_timeout seconds between attempts.
+
+        :param f: The method that performs the actual call
+        :param args: Positional arguments passed to the method
+        :param kwargs: Keyword arguments passed to the method
+        :return: The value returned by the external service
+        """
+
+        num_retries = CONF.collector.api_call_retries
+        timeout = CONF.collector.api_query_timeout
+
+        for i in range(num_retries):
+            try:
+                return f(*args, **kwargs)
+            except Exception as e:
+                LOG.exception(e)
+                self.call_retry_reset(e)
+                if i == num_retries - 1:
+                    raise
+                LOG.warning("Attempt %d of %d failed, retrying in "
+                            "%d seconds", i + 1, num_retries, timeout)
+                time.sleep(timeout)
+
+    @abc.abstractmethod
+    def call_retry_reset(self, exc):
+        """Attempt to recover after encountering an error.
+
+        Called after a failed call to an external service; the exception
+        raised by that call can be used to decide how best to recover.
+        """
+        pass
+
     @abc.abstractmethod
     def execute(self, model_scope):
         """Build the cluster data model limited to the scope and return it
diff --git a/watcher/decision_engine/model/collector/cinder.py b/watcher/decision_engine/model/collector/cinder.py
index f6a774b8c..5a24389de 100644
--- a/watcher/decision_engine/model/collector/cinder.py
+++ b/watcher/decision_engine/model/collector/cinder.py
@@ -176,9 +176,10 @@ class CinderModelBuilder(base.BaseModelBuilder):
         This includes components which represent actual infrastructure
         hardware.
         """
-        for snode in self.cinder_helper.get_storage_node_list():
+        for snode in self.call_retry(
+                self.cinder_helper.get_storage_node_list):
             self.add_storage_node(snode)
-        for pool in self.cinder_helper.get_storage_pool_list():
+        for pool in self.call_retry(self.cinder_helper.get_storage_pool_list):
             pool = self._build_storage_pool(pool)
             self.model.add_pool(pool)
             storage_name = getattr(pool, 'name')
@@ -213,8 +214,8 @@ class CinderModelBuilder(base.BaseModelBuilder):
         except IndexError:
             pass
 
-        volume_type = self.cinder_helper.get_volume_type_by_backendname(
-            backend)
+        volume_type = self.call_retry(
+            self.cinder_helper.get_volume_type_by_backendname, backend)
 
         # build up the storage node.
         node_attributes = {
@@ -262,7 +263,7 @@ class CinderModelBuilder(base.BaseModelBuilder):
         self._add_virtual_storage()
 
     def _add_virtual_storage(self):
-        volumes = self.cinder_helper.get_volume_list()
+        volumes = self.call_retry(self.cinder_helper.get_volume_list)
         for vol in volumes:
             volume = self._build_volume_node(vol)
             self.model.add_volume(volume)
diff --git a/watcher/decision_engine/model/collector/ironic.py b/watcher/decision_engine/model/collector/ironic.py
index 4ac3ec09c..ecce6e5ac 100644
--- a/watcher/decision_engine/model/collector/ironic.py
+++ b/watcher/decision_engine/model/collector/ironic.py
@@ -105,6 +105,6 @@ class BareMetalModelBuilder(base.BaseModelBuilder):
 
     def execute(self, model_scope):
         # TODO(Dantali0n): Use scope to limit size of model
-        for node in self.ironic_helper.get_ironic_node_list():
+        for node in self.call_retry(self.ironic_helper.get_ironic_node_list):
             self.add_ironic_node(node)
         return self.model
diff --git a/watcher/decision_engine/model/collector/nova.py b/watcher/decision_engine/model/collector/nova.py
index fc3ec6150..8802f7336 100644
--- a/watcher/decision_engine/model/collector/nova.py
+++ b/watcher/decision_engine/model/collector/nova.py
@@ -211,7 +211,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
         self.nova_helper = nova_helper.NovaHelper(osc=self.osc)
 
     def _collect_aggregates(self, host_aggregates, _nodes):
-        aggregate_list = self.nova_helper.get_aggregate_list()
+        aggregate_list = self.call_retry(f=self.nova_helper.get_aggregate_list)
         aggregate_ids = [aggregate['id'] for aggregate
                          in host_aggregates if 'id' in aggregate]
         aggregate_names = [aggregate['name'] for aggregate
@@ -226,7 +226,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
             _nodes.update(aggregate.hosts)
 
     def _collect_zones(self, availability_zones, _nodes):
-        service_list = self.nova_helper.get_service_list()
+        service_list = self.call_retry(f=self.nova_helper.get_service_list)
         zone_names = [zone['name'] for zone
                       in availability_zones]
         include_all_nodes = False
@@ -252,14 +252,15 @@ class NovaModelBuilder(base.BaseModelBuilder):
 
         if not compute_nodes:
             self.no_model_scope_flag = True
-            all_nodes = self.nova_helper.get_compute_node_list()
+            all_nodes = self.call_retry(
+                f=self.nova_helper.get_compute_node_list)
             compute_nodes = set(
                 [node.hypervisor_hostname for node in all_nodes])
         LOG.debug("compute nodes: %s", compute_nodes)
         for node_name in compute_nodes:
-            cnode = self.nova_helper.get_compute_node_by_name(node_name,
-                                                              servers=True,
-                                                              detailed=True)
+            cnode = self.call_retry(
+                self.nova_helper.get_compute_node_by_name,
+                node_name, servers=True, detailed=True)
             if cnode:
                 node_info = cnode[0]
                 # filter out baremetal node
@@ -339,9 +340,8 @@ class NovaModelBuilder(base.BaseModelBuilder):
             # compute API. If we need to request more than 1000 servers,
             # we can set limit=-1. For details, please see:
             # https://bugs.launchpad.net/watcher/+bug/1834679
-            instances = self.nova_helper.get_instance_list(
-                filters=filters,
-                limit=limit)
+            instances = self.call_retry(f=self.nova_helper.get_instance_list,
+                                        filters=filters, limit=limit)
             for inst in instances:
                 # Add Node
                 instance = self._build_instance_node(inst)
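
Below is a rough usage sketch, not part of the change itself, showing how a model builder is expected to plug into the new hooks. ExampleHelper, ExampleModelBuilder and get_node_list are hypothetical stand-ins; only call_retry, call_retry_reset and the [collector] api_call_retries / api_query_timeout options come from the patch above, and the sketch assumes the usual option registration performed when watcher.conf is imported.

# Hypothetical usage sketch, not part of this change.
import watcher.conf  # noqa: F401, assumed to register the [collector] opts
from watcher.decision_engine.model.collector import base


class ExampleHelper(object):
    """Stand-in for a client wrapper such as nova_helper.NovaHelper."""

    def get_node_list(self):
        # A real helper would query the external API here.
        return ["node-0", "node-1"]


class ExampleModelBuilder(base.BaseModelBuilder):

    def __init__(self):
        self.helper = ExampleHelper()

    def call_retry_reset(self, exc):
        # Invoked after every failed attempt; recreating the client is one
        # possible recovery strategy, the choice is left to each builder.
        self.helper = ExampleHelper()

    def execute(self, model_scope):
        # Every external call goes through call_retry: a failing call is
        # reattempted api_call_retries times (default 10), sleeping
        # api_query_timeout seconds (default 1) between attempts before
        # the exception is propagated to the caller.
        return list(self.call_retry(self.helper.get_node_list))

Routing every helper call through call_retry keeps the retry policy in one place, while call_retry_reset lets each builder decide whether its client needs to be rebuilt before the next attempt.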