Add support for cluster restart

Implement a cluster rolling restart strategy.
Add support for Cassandra and PXC.

Add some missing cluster upgrade infrastructure.

Implements: blueprint cluster-restart
Co-Authored-By: Petr Malik <pmalik@tesora.com>
Co-Authored-By: Peter Stachowski <peter@tesora.com>
Change-Id: I21e654a8dd2dc6a74aa095604f78db4e96c70d64
Petr Malik 2016-11-01 17:44:55 -04:00
parent 6a917bab58
commit 6e7fa196dc
17 changed files with 298 additions and 8 deletions
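
(A usage sketch, not part of this change: the new action can be driven
end-to-end through python-troveclient, whose cluster manager exposes a
matching restart() call; the scenario tests below invoke it the same way.
Credentials, endpoint, and the cluster id are illustrative placeholders.)

    from troveclient.v1 import client

    trove = client.Client('admin', 'secret', project_id='demo',
                          auth_url='http://keystone:5000/v2.0')
    cluster_id = 'REPLACE-WITH-CLUSTER-UUID'
    # Kicks off the asynchronous rolling restart; poll the cluster until
    # its task status returns to NONE.
    trove.clusters.restart(cluster_id)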

View File

@@ -0,0 +1,4 @@
---
features:
- |
Add support for cluster restart.
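
(A request sketch: the action rides on the existing cluster actions
endpoint, and the body key matches the 'restart' action parsed by
Cluster.action() below; tenant and cluster ids are placeholders.)

    POST /v1.0/{tenant_id}/clusters/{cluster_id}
    Content-Type: application/json

    {"restart": {}}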

View File

@@ -21,8 +21,9 @@ from trove.cluster.tasks import ClusterTasks
from trove.common import cfg
from trove.common import exception
from trove.common.i18n import _
-from trove.common.notification import DBaaSClusterGrow, DBaaSClusterShrink
-from trove.common.notification import DBaaSClusterResetStatus
+from trove.common.notification import (DBaaSClusterGrow, DBaaSClusterShrink,
+DBaaSClusterResetStatus,
+DBaaSClusterRestart)
from trove.common.notification import DBaaSClusterUpgrade
from trove.common.notification import StartNotification
from trove.common import remote
@@ -316,6 +317,11 @@ class Cluster(object):
with StartNotification(context, cluster_id=self.id):
return self.reset_status()
elif action == 'restart':
context.notification = DBaaSClusterRestart(context, request=req)
with StartNotification(context, cluster_id=self.id):
return self.restart()
elif action == 'upgrade':
context.notification = DBaaSClusterUpgrade(context, request=req)
dv_id = param['datastore_version']
@@ -332,6 +338,41 @@ class Cluster(object):
def shrink(self, instance_ids):
raise exception.BadRequest(_("Action 'shrink' not supported"))
def rolling_restart(self):
self.validate_cluster_available()
self.db_info.update(task_status=ClusterTasks.RESTARTING_CLUSTER)
try:
cluster_id = self.db_info.id
task_api.load(self.context, self.ds_version.manager
).restart_cluster(cluster_id)
except Exception:
self.db_info.update(task_status=ClusterTasks.NONE)
raise
return self.__class__(self.context, self.db_info,
self.ds, self.ds_version)
def rolling_upgrade(self, datastore_version):
"""Upgrades a cluster to a new datastore version."""
LOG.debug("Upgrading cluster %s." % self.id)
self.validate_cluster_available()
self.db_info.update(task_status=ClusterTasks.UPGRADING_CLUSTER)
try:
cluster_id = self.db_info.id
ds_ver_id = datastore_version.id
task_api.load(self.context, self.ds_version.manager
).upgrade_cluster(cluster_id, ds_ver_id)
except Exception:
self.db_info.update(task_status=ClusterTasks.NONE)
raise
return self.__class__(self.context, self.db_info,
self.ds, self.ds_version)
def restart(self):
raise exception.BadRequest(_("Action 'restart' not supported"))
def upgrade(self, datastore_version):
raise exception.BadRequest(_("Action 'upgrade' not supported"))

View File

@@ -69,6 +69,10 @@ class ClusterTasks(object):
0x05, 'GROWING_CLUSTER', 'Increasing the size of the cluster.')
SHRINKING_CLUSTER = ClusterTask(
0x06, 'SHRINKING_CLUSTER', 'Decreasing the size of the cluster.')
UPGRADING_CLUSTER = ClusterTask(
0x07, 'UPGRADING_CLUSTER', 'Upgrading the cluster to a new version.')
RESTARTING_CLUSTER = ClusterTask(
0x08, 'RESTARTING_CLUSTER', 'Restarting the cluster.')
# Dissuade further additions at run-time.

View File

@@ -938,6 +938,26 @@ cassandra_opts = [
help='Character length of generated passwords.',
deprecated_name='default_password_length',
deprecated_group='DEFAULT'),
cfg.BoolOpt('enable_cluster_instance_backup',
default=False,
help='Allows backup of a single instance in the cluster.'),
cfg.BoolOpt('enable_saslauthd', default=False,
help='Enable the saslauthd daemon.'),
cfg.StrOpt('user_controller',
default='trove.extensions.cassandra.service.'
'CassandraUserController',
help='User controller implementation.'),
cfg.StrOpt('database_controller',
default='trove.extensions.cassandra.service.'
'CassandraDatabaseController',
help='Database controller implementation.'),
cfg.StrOpt('user_access_controller',
default='trove.extensions.cassandra.service.'
'CassandraUserAccessController',
help='User access controller implementation.'),
cfg.IntOpt('node_sync_time', default=60,
help='Time (in seconds) given to a node after a state change '
'to finish rejoining the cluster.'),
]
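
(A configuration sketch: a deployment can widen the rejoin window via
trove.conf; 120 is illustrative, the default above is 60.)

    [cassandra]
    # Give each restarted node two minutes to rejoin the cluster before
    # the rolling restart moves on to the next node.
    node_sync_time = 120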
# Couchbase

View File

@@ -564,6 +564,15 @@ class DBaaSClusterCreate(DBaaSAPINotification):
return ['cluster_id']
class DBaaSClusterRestart(DBaaSAPINotification):
def event_type(self):
return 'cluster_restart'
def required_start_traits(self):
return ['cluster_id']
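
(This class only supplies the event type and its required trait; assuming
the usual DBaaSAPINotification start/end framing, the wrapped action should
emit events shaped roughly like the following.)

    dbaas.cluster_restart.start    {'cluster_id': <id>, ...}
    dbaas.cluster_restart.end      {'cluster_id': <id>, ...}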
class DBaaSClusterUpgrade(DBaaSAPINotification):
@abc.abstractmethod

View File

@@ -206,6 +206,12 @@ class CassandraCluster(models.Cluster):
return CassandraCluster(context, db_info, datastore, datastore_version)
def restart(self):
self.rolling_restart()
def upgrade(self, datastore_version):
self.rolling_upgrade(datastore_version)
class CassandraClusterView(ClusterView):

View File

@@ -341,6 +341,13 @@ class CassandraClusterTasks(task_models.ClusterTasks):
LOG.debug("End shrink_cluster for id: %s." % cluster_id)
def restart_cluster(self, context, cluster_id):
self.rolling_restart_cluster(
context, cluster_id, delay_sec=CONF.cassandra.node_sync_time)
def upgrade_cluster(self, context, cluster_id, datastore_version):
self.rolling_upgrade_cluster(context, cluster_id, datastore_version)
class CassandraTaskManagerAPI(task_api.API):
pass

View File

@@ -197,6 +197,12 @@ class GaleraCommonCluster(cluster_models.Cluster):
return self.__class__(self.context, self.db_info,
self.ds, self.ds_version)
def restart(self):
self.rolling_restart()
def upgrade(self, datastore_version):
self.rolling_upgrade(datastore_version)
class GaleraCommonClusterView(ClusterView):

View File

@@ -325,3 +325,9 @@ class GaleraCommonClusterTasks(task_models.ClusterTasks):
timeout.cancel()
LOG.debug("End shrink_cluster for id: %s." % cluster_id)
def restart_cluster(self, context, cluster_id):
self.rolling_restart_cluster(context, cluster_id)
def upgrade_cluster(self, context, cluster_id, datastore_version):
self.rolling_upgrade_cluster(context, cluster_id, datastore_version)

View File

@@ -114,6 +114,9 @@ class InstanceTasks(object):
SHRINKING_ERROR = InstanceTask(0x58, 'SHRINKING',
'Shrinking Cluster Error.',
is_error=True)
UPGRADING_ERROR = InstanceTask(0x59, 'UPGRADING',
'Upgrading Cluster Error.',
is_error=True)
UPGRADING = InstanceTask(0x59, 'UPGRADING', 'Upgrading the instance.')
# Dissuade further additions at run-time.

View File

@@ -251,6 +251,22 @@ class API(object):
cctxt.cast(self.context, "upgrade", instance_id=instance_id,
datastore_version_id=datastore_version_id)
def restart_cluster(self, cluster_id):
LOG.debug("Making async call to restart cluster %s " % cluster_id)
version = self.API_BASE_VERSION
cctxt = self.client.prepare(version=version)
cctxt.cast(self.context, "restart_cluster", cluster_id=cluster_id)
def upgrade_cluster(self, cluster_id, datastore_version_id):
LOG.debug("Making async call to upgrade guest to datastore "
"version %s " % datastore_version_id)
version = self.API_BASE_VERSION
cctxt = self.client.prepare(version=version)
cctxt.cast(self.context, "upgrade_cluster", cluster_id=cluster_id,
datastore_version_id=datastore_version_id)
def load(context, manager=None):
if manager:

View File

@@ -371,7 +371,7 @@ class Manager(periodic_task.PeriodicTasks):
cluster_config, volume_type, modules, locality):
with EndNotification(context,
instance_id=(instance_id[0]
-if type(instance_id) is list
+if isinstance(instance_id, list)
else instance_id)):
self._create_instance(context, instance_id, name, flavor,
image_id, databases, users,
@@ -409,6 +409,15 @@ class Manager(periodic_task.PeriodicTasks):
cluster_tasks = models.load_cluster_tasks(context, cluster_id)
cluster_tasks.shrink_cluster(context, cluster_id, instance_ids)
def restart_cluster(self, context, cluster_id):
cluster_tasks = models.load_cluster_tasks(context, cluster_id)
cluster_tasks.restart_cluster(context, cluster_id)
def upgrade_cluster(self, context, cluster_id, datastore_version_id):
datastore_version = DatastoreVersion.load_by_uuid(datastore_version_id)
cluster_tasks = models.load_cluster_tasks(context, cluster_id)
cluster_tasks.upgrade_cluster(context, cluster_id, datastore_version)
def delete_cluster(self, context, cluster_id):
with EndNotification(context):
cluster_tasks = models.load_cluster_tasks(context, cluster_id)

View File

@@ -13,10 +13,12 @@
# under the License.
import os.path
import time
import traceback
from cinderclient import exceptions as cinder_exceptions
from eventlet import greenthread
from eventlet.timeout import Timeout
from heatclient import exc as heat_exceptions
from novaclient import exceptions as nova_exceptions
from oslo_log import log as logging
@@ -45,6 +47,10 @@ from trove.common.i18n import _
from trove.common import instance as rd_instance
from trove.common.instance import ServiceStatuses
from trove.common.notification import (
DBaaSInstanceRestart,
DBaaSInstanceUpgrade,
EndNotification,
StartNotification,
TroveInstanceCreate,
TroveInstanceModifyVolume,
TroveInstanceModifyFlavor,
@@ -316,6 +322,88 @@ class ClusterTasks(Cluster):
cluster.save()
LOG.debug("end delete_cluster for id: %s" % cluster_id)
def rolling_restart_cluster(self, context, cluster_id, delay_sec=0):
LOG.debug("Begin rolling cluster restart for id: %s" % cluster_id)
def _restart_cluster_instance(instance):
LOG.debug("Restarting instance with id: %s" % instance.id)
context.notification = (
DBaaSInstanceRestart(context, **request_info))
with StartNotification(context, instance_id=instance.id):
with EndNotification(context):
instance.update_db(task_status=InstanceTasks.REBOOTING)
instance.restart()
timeout = Timeout(CONF.cluster_usage_timeout)
cluster_notification = context.notification
request_info = cluster_notification.serialize(context)
try:
node_db_inst = DBInstance.find_all(cluster_id=cluster_id).all()
for index, db_inst in enumerate(node_db_inst):
if index > 0:
LOG.debug(
"Waiting (%ds) for restarted nodes to rejoin the "
"cluster before proceeding." % delay_sec)
time.sleep(delay_sec)
instance = BuiltInstanceTasks.load(context, db_inst.id)
_restart_cluster_instance(instance)
except Timeout as t:
if t is not timeout:
raise # not my timeout
LOG.exception(_("Timeout for restarting cluster."))
raise
except Exception:
LOG.exception(_("Error restarting cluster.") % cluster_id)
raise
finally:
context.notification = cluster_notification
timeout.cancel()
self.reset_task()
LOG.debug("End rolling restart for id: %s." % cluster_id)
def rolling_upgrade_cluster(self, context, cluster_id, datastore_version):
LOG.debug("Begin rolling cluster upgrade for id: %s." % cluster_id)
def _upgrade_cluster_instance(instance):
LOG.debug("Upgrading instance with id: %s." % instance.id)
context.notification = (
DBaaSInstanceUpgrade(context, **request_info))
with StartNotification(
context, instance_id=instance.id,
datastore_version_id=datastore_version.id):
with EndNotification(context):
instance.update_db(
datastore_version_id=datastore_version.id,
task_status=InstanceTasks.UPGRADING)
instance.upgrade(datastore_version)
timeout = Timeout(CONF.cluster_usage_timeout)
cluster_notification = context.notification
request_info = cluster_notification.serialize(context)
try:
for db_inst in DBInstance.find_all(cluster_id=cluster_id).all():
instance = BuiltInstanceTasks.load(
context, db_inst.id)
_upgrade_cluster_instance(instance)
self.reset_task()
except Timeout as t:
if t is not timeout:
raise # not my timeout
LOG.exception(_("Timeout for upgrading cluster."))
self.update_statuses_on_failure(
cluster_id, status=InstanceTasks.UPGRADING_ERROR)
except Exception:
LOG.exception(_("Error upgrading cluster %s.") % cluster_id)
self.update_statuses_on_failure(
cluster_id, status=InstanceTasks.UPGRADING_ERROR)
finally:
context.notification = cluster_notification
timeout.cancel()
LOG.debug("End upgrade_cluster for id: %s." % cluster_id)
class FreshInstanceTasks(FreshInstance, NotifyMixin, ConfigurationMixin):

View File

@@ -162,6 +162,9 @@ cluster_root_groups.extend([groups.CLUSTER_ACTIONS_ROOT_ENABLE])
cluster_root_actions_groups = list(cluster_actions_groups)
cluster_root_actions_groups.extend([groups.CLUSTER_ACTIONS_ROOT_ACTIONS])
cluster_restart_groups = list(cluster_create_groups)
cluster_restart_groups.extend([groups.CLUSTER_ACTIONS_RESTART_WAIT])
cluster_upgrade_groups = list(cluster_create_groups)
cluster_upgrade_groups.extend([groups.CLUSTER_UPGRADE_WAIT])
@@ -247,6 +250,7 @@ register(["cluster"], cluster_actions_groups)
register(["cluster_actions"], cluster_actions_groups)
register(["cluster_create"], cluster_create_groups)
register(["cluster_negative_actions"], cluster_negative_actions_groups)
register(["cluster_restart"], cluster_restart_groups)
register(["cluster_root"], cluster_root_groups)
register(["cluster_root_actions"], cluster_root_actions_groups)
register(["cluster_upgrade"], cluster_upgrade_groups)

View File

@@ -61,6 +61,8 @@ CLUSTER_ACTIONS_GROW = "scenario.cluster_actions_grow_grp"
CLUSTER_ACTIONS_GROW_WAIT = "scenario.cluster_actions_grow_wait_grp"
CLUSTER_ACTIONS_SHRINK = "scenario.cluster_actions_shrink_grp"
CLUSTER_ACTIONS_SHRINK_WAIT = "scenario.cluster_actions_shrink_wait_grp"
CLUSTER_ACTIONS_RESTART = "scenario.cluster_actions_restart_grp"
CLUSTER_ACTIONS_RESTART_WAIT = "scenario.cluster_actions_restart_wait_grp"
# Cluster Create Group (in cluster_actions file)

View File

@@ -92,8 +92,44 @@ class ClusterCreateWaitGroup(TestGroup):
@test(groups=[GROUP, groups.CLUSTER_ACTIONS,
-groups.CLUSTER_ACTIONS_ROOT_ENABLE],
+groups.CLUSTER_ACTIONS_RESTART],
depends_on_groups=[groups.CLUSTER_CREATE_WAIT])
class ClusterRestartGroup(TestGroup):
def __init__(self):
super(ClusterRestartGroup, self).__init__(
ClusterRunnerFactory.instance())
@test
def cluster_restart(self):
"""Restart the cluster."""
self.test_runner.run_cluster_restart()
@test(groups=[GROUP, groups.CLUSTER_ACTIONS,
groups.CLUSTER_ACTIONS_RESTART_WAIT],
depends_on_groups=[groups.CLUSTER_ACTIONS_RESTART])
class ClusterRestartWaitGroup(TestGroup):
def __init__(self):
super(ClusterRestartWaitGroup, self).__init__(
ClusterRunnerFactory.instance())
@test
def cluster_restart_wait(self):
"""Wait for cluster restart to complete."""
self.test_runner.run_cluster_restart_wait()
@test(depends_on=[cluster_restart_wait])
def verify_initial_cluster_data(self):
"""Verify the initial data still exists after cluster restart."""
self.test_runner.run_verify_initial_cluster_data()
@test(groups=[GROUP, groups.CLUSTER_ACTIONS,
groups.CLUSTER_ACTIONS_ROOT_ENABLE],
depends_on_groups=[groups.CLUSTER_CREATE_WAIT],
runs_after_groups=[groups.CLUSTER_ACTIONS_RESTART_WAIT])
class ClusterRootEnableGroup(TestGroup):
def __init__(self):
@@ -308,7 +344,8 @@ class ClusterRootEnableShrinkGroup(TestGroup):
groups.CLUSTER_ACTIONS_ROOT_SHRINK,
groups.CLUSTER_ACTIONS_GROW_WAIT,
groups.CLUSTER_ACTIONS_SHRINK_WAIT,
-groups.CLUSTER_UPGRADE_WAIT])
+groups.CLUSTER_UPGRADE_WAIT,
+groups.CLUSTER_ACTIONS_RESTART_WAIT])
class ClusterDeleteGroup(TestGroup):
def __init__(self):

View File

@@ -160,6 +160,34 @@ class ClusterRunner(TestRunner):
self.assert_cluster_show(
self.cluster_id, expected_task_name, expected_http_code)
def run_cluster_restart(self, expected_http_code=202,
expected_task_name='RESTARTING_CLUSTER'):
self.assert_cluster_restart(
self.cluster_id, expected_task_name, expected_http_code)
def assert_cluster_restart(
self, cluster_id, expected_task_name, expected_http_code):
client = self.auth_client
client.clusters.restart(cluster_id)
self.assert_client_code(client, expected_http_code)
self._assert_cluster_response(
client, cluster_id, expected_task_name)
def run_cluster_restart_wait(self):
self.assert_cluster_restart_wait(self.cluster_id)
def assert_cluster_restart_wait(self, cluster_id):
client = self.auth_client
cluster_instances = self._get_cluster_instances(
client, cluster_id)
self.assert_all_instance_states(
cluster_instances, ['REBOOT', 'ACTIVE'])
self._assert_cluster_states(
client, cluster_id, ['NONE'])
self._assert_cluster_response(
client, cluster_id, 'NONE')
def assert_cluster_show(self, cluster_id, expected_task_name,
expected_http_code):
self._assert_cluster_response(self.auth_client,