add strategy host_maintenance

Maintain one compute node without the user's applications being
interrupted.
It will first migrate all instances from the maintenance node
to a given backup node. If no backup node is given, it will migrate
all instances, relying on nova-scheduler.

Change-Id: I29ecb65745d5e6ecab41508e9a91b29b39a3f0a8
Implements: blueprint cluster-maintaining
suzhengwei 2017-06-30 15:43:45 +08:00
parent 40a653215f
commit 58276ec79e
7 changed files with 586 additions and 1 deletion


@@ -0,0 +1,9 @@
---
features:
- |
    Added a strategy for one compute node maintenance,
    without the user's application being interrupted.
    If given one backup node, the strategy will first
    migrate all instances from the maintenance node to
the backup node. If the backup node is not provided,
it will migrate all instances, relying on nova-scheduler.
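
A minimal sketch of that decision flow, assuming hypothetical helpers
find_unused_node_that_fits(), host_fits() and migrate_all() in place of the
strategy methods added by this change:

    # Illustrative only; the real logic lives in HostMaintenance.do_execute().
    def plan_host_maintenance(maintenance_node, backup_node=None):
        if backup_node is None:
            # hypothetical helper: pick an unused, disabled node with room
            backup_node = find_unused_node_that_fits(maintenance_node)
        if backup_node is not None and host_fits(maintenance_node, backup_node):
            # a suitable backup node exists: migrate everything to it
            migrate_all(maintenance_node, destination=backup_node)
        else:
            # otherwise let nova-scheduler place each instance
            migrate_all(maintenance_node, destination=None)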


@@ -58,6 +58,7 @@ watcher_goals =
noisy_neighbor = watcher.decision_engine.goal.goals:NoisyNeighborOptimization
saving_energy = watcher.decision_engine.goal.goals:SavingEnergy
hardware_maintenance = watcher.decision_engine.goal.goals:HardwareMaintenance
cluster_maintaining = watcher.decision_engine.goal.goals:ClusterMaintaining
watcher_scoring_engines =
dummy_scorer = watcher.decision_engine.scoring.dummy_scorer:DummyScorer
@@ -80,6 +81,7 @@ watcher_strategies =
noisy_neighbor = watcher.decision_engine.strategy.strategies.noisy_neighbor:NoisyNeighbor
storage_capacity_balance = watcher.decision_engine.strategy.strategies.storage_capacity_balance:StorageCapacityBalance
zone_migration = watcher.decision_engine.strategy.strategies.zone_migration:ZoneMigration
host_maintenance = watcher.decision_engine.strategy.strategies.host_maintenance:HostMaintenance
watcher_actions =
migrate = watcher.applier.actions.migration:Migrate


@@ -241,3 +241,28 @@ class HardwareMaintenance(base.Goal):
def get_efficacy_specification(cls):
"""The efficacy spec for the current goal"""
return specs.HardwareMaintenance()
class ClusterMaintaining(base.Goal):
"""ClusterMaintaining
This goal is used to maintain compute nodes
without the user's application being interrupted.
"""
@classmethod
def get_name(cls):
return "cluster_maintaining"
@classmethod
def get_display_name(cls):
return _("Cluster Maintaining")
@classmethod
def get_translatable_display_name(cls):
return "Cluster Maintaining"
@classmethod
def get_efficacy_specification(cls):
"""The efficacy spec for the current goal"""
return specs.Unclassified()


@@ -18,6 +18,7 @@ from watcher.decision_engine.strategy.strategies import actuation
from watcher.decision_engine.strategy.strategies import basic_consolidation
from watcher.decision_engine.strategy.strategies import dummy_strategy
from watcher.decision_engine.strategy.strategies import dummy_with_scorer
from watcher.decision_engine.strategy.strategies import host_maintenance
from watcher.decision_engine.strategy.strategies import noisy_neighbor
from watcher.decision_engine.strategy.strategies import outlet_temp_control
from watcher.decision_engine.strategy.strategies import saving_energy
@@ -44,9 +45,10 @@ WorkloadStabilization = workload_stabilization.WorkloadStabilization
UniformAirflow = uniform_airflow.UniformAirflow
NoisyNeighbor = noisy_neighbor.NoisyNeighbor
ZoneMigration = zone_migration.ZoneMigration
HostMaintenance = host_maintenance.HostMaintenance
__all__ = ("Actuator", "BasicConsolidation", "OutletTempControl",
"DummyStrategy", "DummyWithScorer", "VMWorkloadConsolidation",
"WorkloadBalance", "WorkloadStabilization", "UniformAirflow",
"NoisyNeighbor", "SavingEnergy", "StorageCapacityBalance",
"ZoneMigration", "HostMaintenance")

watcher/decision_engine/strategy/strategies/base.py Normal file → Executable file

@@ -471,3 +471,13 @@ class ZoneMigrationBaseStrategy(BaseStrategy):
@classmethod
def get_goal_name(cls):
return "hardware_maintenance"
@six.add_metaclass(abc.ABCMeta)
class HostMaintenanceBaseStrategy(BaseStrategy):
REASON_FOR_MAINTAINING = 'watcher_maintaining'
@classmethod
def get_goal_name(cls):
return "cluster_maintaining"


@@ -0,0 +1,331 @@
# -*- encoding: utf-8 -*-
# Copyright (c) 2017 chinac.com
#
# Authors: suzhengwei<suzhengwei@chinac.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from oslo_log import log
import six
from watcher._i18n import _
from watcher.common import exception as wexc
from watcher.decision_engine.model import element
from watcher.decision_engine.strategy.strategies import base
LOG = log.getLogger(__name__)
class HostMaintenance(base.HostMaintenanceBaseStrategy):
"""[PoC]Host Maintenance
*Description*
It is a migration strategy for one compute node maintenance,
without the user's application being interrupted.
If given one backup node, the strategy will firstly
migrate all instances from the maintenance node to
the backup node. If the backup node is not provided,
it will migrate all instances, relying on nova-scheduler.
*Requirements*
* You must have at least 2 physical compute nodes to run this strategy.
*Limitations*
- This is a proof of concept that is not meant to be used in production
- It migrates all instances from one host to other hosts. It is better to
execute this strategy when the load is not heavy, and to use this
algorithm with a `ONESHOT` audit.
- It assumes that cold and live migrations are possible.
"""
INSTANCE_MIGRATION = "migrate"
CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state"
REASON_FOR_DISABLE = 'watcher_disabled'
def __init__(self, config, osc=None):
super(HostMaintenance, self).__init__(config, osc)
@classmethod
def get_name(cls):
return "host_maintenance"
@classmethod
def get_display_name(cls):
return _("Host Maintenance Strategy")
@classmethod
def get_translatable_display_name(cls):
return "Host Maintenance Strategy"
@classmethod
def get_schema(cls):
return {
"properties": {
"maintenance_node": {
"description": "The name of the compute node which "
"needs maintenance",
"type": "string",
},
"backup_node": {
"description": "The name of the compute node which "
"will back up the maintenance node.",
"type": "string",
},
},
"required": ["maintenance_node"],
}
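# Example usage (illustrative, not part of this patch): audit parameters
# for this strategy can be validated against the schema above with the
# jsonschema library; the node names below are hypothetical.
#
#     import jsonschema
#     params = {"maintenance_node": "compute01", "backup_node": "compute02"}
#     jsonschema.validate(params, HostMaintenance.get_schema())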
def get_disabled_compute_nodes_with_reason(self, reason=None):
return {uuid: cn for uuid, cn in
self.compute_model.get_all_compute_nodes().items()
if cn.state == element.ServiceState.ONLINE.value and
cn.status == element.ServiceState.DISABLED.value and
cn.disabled_reason == reason}
def get_disabled_compute_nodes(self):
return self.get_disabled_compute_nodes_with_reason(
self.REASON_FOR_DISABLE)
def get_instance_state_str(self, instance):
"""Get instance state in string format"""
if isinstance(instance.state, six.string_types):
return instance.state
elif isinstance(instance.state, element.InstanceState):
return instance.state.value
else:
LOG.error('Unexpected instance state type, '
'state=%(state)s, state_type=%(st)s.',
dict(state=instance.state,
st=type(instance.state)))
raise wexc.WatcherException
def get_node_status_str(self, node):
"""Get node status in string format"""
if isinstance(node.status, six.string_types):
return node.status
elif isinstance(node.status, element.ServiceState):
return node.status.value
else:
LOG.error('Unexpected node status type, '
'status=%(status)s, status_type=%(st)s.',
dict(status=node.status,
st=type(node.status)))
raise wexc.WatcherException
def get_node_capacity(self, node):
"""Collect cpu, ram and disk capacity of a node.
:param node: node object
:return: dict(cpu(cores), ram(MB), disk(B))
"""
return dict(cpu=node.vcpus,
ram=node.memory,
disk=node.disk_capacity)
def get_node_used(self, node):
"""Collect cpu, ram and disk used of a node.
:param node: node object
:return: dict(cpu(cores), ram(MB), disk(B))
"""
vcpus_used = 0
memory_used = 0
disk_used = 0
for instance in self.compute_model.get_node_instances(node):
vcpus_used += instance.vcpus
memory_used += instance.memory
disk_used += instance.disk
return dict(cpu=vcpus_used,
ram=memory_used,
disk=disk_used)
def get_node_free(self, node):
"""Collect cpu, ram and disk free of a node.
:param node: node object
:return: dict(cpu(cores), ram(MB), disk(B))
"""
node_capacity = self.get_node_capacity(node)
node_used = self.get_node_used(node)
return dict(cpu=node_capacity['cpu']-node_used['cpu'],
ram=node_capacity['ram']-node_used['ram'],
disk=node_capacity['disk']-node_used['disk'],
)
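# Example with the numbers used in the unit tests below: a node with
# capacity {'cpu': 40, 'ram': 132, 'disk': 250} and usage
# {'cpu': 20, 'ram': 4, 'disk': 40} has {'cpu': 20, 'ram': 128, 'disk': 210}
# free.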
def host_fits(self, source_node, destination_node):
"""check host fits
Return True if all instances on source_node could be migrated
to destination_node.
"""
source_node_used = self.get_node_used(source_node)
destination_node_free = self.get_node_free(destination_node)
metrics = ['cpu', 'ram']
for m in metrics:
if source_node_used[m] > destination_node_free[m]:
return False
return True
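# Example, using the fake cluster from the unit tests below: Node_0 uses
# {'cpu': 20, 'ram': 4} and Node_1 has at least that much free for both
# metrics, so host_fits(Node_0, Node_1) returns True. Disk is not part of
# the check.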
def add_action_enable_compute_node(self, node):
"""Add an action for node enabler into the solution."""
params = {'state': element.ServiceState.ENABLED.value}
self.solution.add_action(
action_type=self.CHANGE_NOVA_SERVICE_STATE,
resource_id=node.uuid,
input_parameters=params)
def add_action_maintain_compute_node(self, node):
"""Add an action for node maintenance into the solution."""
params = {'state': element.ServiceState.DISABLED.value,
'disabled_reason': self.REASON_FOR_MAINTAINING}
self.solution.add_action(
action_type=self.CHANGE_NOVA_SERVICE_STATE,
resource_id=node.uuid,
input_parameters=params)
def enable_compute_node_if_disabled(self, node):
node_status_str = self.get_node_status_str(node)
if node_status_str != element.ServiceState.ENABLED.value:
self.add_action_enable_compute_node(node)
def instance_migration(self, instance, src_node, des_node=None):
"""Add an action for instance migration into the solution.
:param instance: instance object
:param src_node: node object
:param des_node: node object. if None, the instance will be
migrated relying on nova-scheduler
:return: None
"""
instance_state_str = self.get_instance_state_str(instance)
if instance_state_str == element.InstanceState.ACTIVE.value:
migration_type = 'live'
else:
migration_type = 'cold'
params = {'migration_type': migration_type,
'source_node': src_node.uuid}
if des_node:
params['destination_node'] = des_node.uuid
self.solution.add_action(action_type=self.INSTANCE_MIGRATION,
resource_id=instance.uuid,
input_parameters=params)
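# Example of the resulting action for an ACTIVE instance (values taken
# from the unit tests below):
#
#     {'action_type': 'migrate',
#      'input_parameters': {'migration_type': 'live',
#                           'source_node': 'Node_0',
#                           'destination_node': 'Node_1',
#                           'resource_id': 'INSTANCE_0'}}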
def host_migration(self, source_node, destination_node):
"""host migration
Migrate all instances from source_node to destination_node.
Active instances use "live-migrate",
and other instances use "cold-migrate"
"""
instances = self.compute_model.get_node_instances(source_node)
for instance in instances:
self.instance_migration(instance, source_node, destination_node)
def safe_maintain(self, maintenance_node, backup_node=None):
"""Safely maintain one compute node
Migrate all instances of the maintenance_node intensively to the
backup node. If the user does not give a backup node, it will select
one unused node to back up the maintenance node.
It calculates the resources of both the backup node and the maintenance
node to evaluate whether the migrations from the maintenance node to
the backup node are possible. If all instances of the maintenance node
can be migrated to the backup node, it sets the maintenance node to the
'watcher_maintaining' status and adds the migrations to the solution.
"""
# If the user gives a backup node with the required capacity, then migrate
# all instances from the maintenance node to the backup node.
if backup_node:
if self.host_fits(maintenance_node, backup_node):
self.enable_compute_node_if_disabled(backup_node)
self.add_action_maintain_compute_node(maintenance_node)
self.host_migration(maintenance_node, backup_node)
return True
# If the user didn't give a backup node, select one unused node
# with the required capacity, then migrate all instances
# from the maintenance node to it.
nodes = sorted(
self.get_disabled_compute_nodes().values(),
key=lambda x: self.get_node_capacity(x)['cpu'])
if maintenance_node in nodes:
nodes.remove(maintenance_node)
for node in nodes:
if self.host_fits(maintenance_node, node):
self.enable_compute_node_if_disabled(node)
self.add_action_maintain_compute_node(maintenance_node)
self.host_migration(maintenance_node, node)
return True
return False
def try_maintain(self, maintenance_node):
"""try to maintain one compute node
It first sets the maintenance_node to the 'watcher_maintaining' status,
then tries to migrate all instances of the maintenance node,
relying on nova-scheduler.
"""
self.add_action_maintain_compute_node(maintenance_node)
instances = self.compute_model.get_node_instances(maintenance_node)
for instance in instances:
self.instance_migration(instance, maintenance_node)
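# For example, for a node hosting a single ACTIVE instance (as in the
# unit tests below), this adds one change_nova_service_state action
# (state 'disabled', reason 'watcher_maintaining') followed by one
# 'migrate' action with no destination_node, so nova-scheduler picks
# the target host.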
def pre_execute(self):
if not self.compute_model:
raise wexc.ClusterStateNotDefined()
if self.compute_model.stale:
raise wexc.ClusterStateStale()
LOG.debug(self.compute_model.to_string())
def do_execute(self):
LOG.info(_('Executing Host Maintenance Migration Strategy'))
maintenance_node = self.input_parameters.get('maintenance_node')
backup_node = self.input_parameters.get('backup_node')
# if no VMs in the maintenance_node, just maintain the compute node
src_node = self.compute_model.get_node_by_uuid(maintenance_node)
if len(self.compute_model.get_node_instances(src_node)) == 0:
if (src_node.disabled_reason !=
self.REASON_FOR_MAINTAINING):
self.add_action_maintain_compute_node(src_node)
return
if backup_node:
des_node = self.compute_model.get_node_by_uuid(backup_node)
else:
des_node = None
if not self.safe_maintain(src_node, des_node):
self.try_maintain(src_node)
def post_execute(self):
"""Post-execution phase
This can be used to compute the global efficacy
"""
LOG.debug(self.solution.actions)
LOG.debug(self.compute_model.to_string())


@@ -0,0 +1,206 @@
# -*- encoding: utf-8 -*-
# Copyright (c) 2017 chinac.com
#
# Authors: suzhengwei<suzhengwei@chinac.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import mock
from watcher.common import exception
from watcher.decision_engine.model import model_root
from watcher.decision_engine.strategy import strategies
from watcher.tests import base
from watcher.tests.decision_engine.model import faker_cluster_state
class TestHostMaintenance(base.TestCase):
def setUp(self):
super(TestHostMaintenance, self).setUp()
# fake cluster
self.fake_cluster = faker_cluster_state.FakerModelCollector()
p_model = mock.patch.object(
strategies.HostMaintenance, "compute_model",
new_callable=mock.PropertyMock)
self.m_model = p_model.start()
self.addCleanup(p_model.stop)
p_audit_scope = mock.patch.object(
strategies.HostMaintenance, "audit_scope",
new_callable=mock.PropertyMock
)
self.m_audit_scope = p_audit_scope.start()
self.addCleanup(p_audit_scope.stop)
self.m_audit_scope.return_value = mock.Mock()
self.m_model.return_value = model_root.ModelRoot()
self.strategy = strategies.HostMaintenance(config=mock.Mock())
def test_exception_stale_cdm(self):
self.fake_cluster.set_cluster_data_model_as_stale()
self.m_model.return_value = self.fake_cluster.cluster_data_model
self.assertRaises(
exception.ClusterStateNotDefined,
self.strategy.execute)
def test_get_node_capacity(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid("Node_0")
node_capacity = dict(cpu=40, ram=132, disk=250)
self.assertEqual(node_capacity,
self.strategy.get_node_capacity(node_0))
def test_get_node_used(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid("Node_0")
node_used = dict(cpu=20, ram=4, disk=40)
self.assertEqual(node_used,
self.strategy.get_node_used(node_0))
def test_get_node_free(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid("Node_0")
node_free = dict(cpu=20, ram=128, disk=210)
self.assertEqual(node_free,
self.strategy.get_node_free(node_0))
def test_host_fits(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid("Node_0")
node_1 = model.get_node_by_uuid("Node_1")
self.assertTrue(self.strategy.host_fits(node_0, node_1))
def test_add_action_enable_compute_node(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid('Node_0')
self.strategy.add_action_enable_compute_node(node_0)
expected = [{'action_type': 'change_nova_service_state',
'input_parameters': {
'state': 'enabled',
'resource_id': 'Node_0'}}]
self.assertEqual(expected, self.strategy.solution.actions)
def test_add_action_maintain_compute_node(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid('Node_0')
self.strategy.add_action_maintain_compute_node(node_0)
expected = [{'action_type': 'change_nova_service_state',
'input_parameters': {
'state': 'disabled',
'disabled_reason': 'watcher_maintaining',
'resource_id': 'Node_0'}}]
self.assertEqual(expected, self.strategy.solution.actions)
def test_instance_migration(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid('Node_0')
node_1 = model.get_node_by_uuid('Node_1')
instance_0 = model.get_instance_by_uuid("INSTANCE_0")
self.strategy.instance_migration(instance_0, node_0, node_1)
self.assertEqual(1, len(self.strategy.solution.actions))
expected = [{'action_type': 'migrate',
'input_parameters': {'destination_node': node_1.uuid,
'source_node': node_0.uuid,
'migration_type': 'live',
'resource_id': instance_0.uuid}}]
self.assertEqual(expected, self.strategy.solution.actions)
def test_instance_migration_without_dest_node(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid('Node_0')
instance_0 = model.get_instance_by_uuid("INSTANCE_0")
self.strategy.instance_migration(instance_0, node_0)
self.assertEqual(1, len(self.strategy.solution.actions))
expected = [{'action_type': 'migrate',
'input_parameters': {'source_node': node_0.uuid,
'migration_type': 'live',
'resource_id': instance_0.uuid}}]
self.assertEqual(expected, self.strategy.solution.actions)
def test_host_migration(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid('Node_0')
node_1 = model.get_node_by_uuid('Node_1')
instance_0 = model.get_instance_by_uuid("INSTANCE_0")
instance_1 = model.get_instance_by_uuid("INSTANCE_1")
self.strategy.host_migration(node_0, node_1)
self.assertEqual(2, len(self.strategy.solution.actions))
expected = [{'action_type': 'migrate',
'input_parameters': {'destination_node': node_1.uuid,
'source_node': node_0.uuid,
'migration_type': 'live',
'resource_id': instance_0.uuid}},
{'action_type': 'migrate',
'input_parameters': {'destination_node': node_1.uuid,
'source_node': node_0.uuid,
'migration_type': 'live',
'resource_id': instance_1.uuid}}]
self.assertIn(expected[0], self.strategy.solution.actions)
self.assertIn(expected[1], self.strategy.solution.actions)
def test_safe_maintain(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_0 = model.get_node_by_uuid('Node_0')
node_1 = model.get_node_by_uuid('Node_1')
self.assertFalse(self.strategy.safe_maintain(node_0))
self.assertFalse(self.strategy.safe_maintain(node_1))
def test_try_maintain(self):
model = self.fake_cluster.generate_scenario_1()
self.m_model.return_value = model
node_1 = model.get_node_by_uuid('Node_1')
self.strategy.try_maintain(node_1)
self.assertEqual(2, len(self.strategy.solution.actions))
def test_strategy(self):
model = self.fake_cluster. \
generate_scenario_9_with_3_active_plus_1_disabled_nodes()
self.m_model.return_value = model
node_2 = model.get_node_by_uuid('Node_2')
node_3 = model.get_node_by_uuid('Node_3')
instance_4 = model.get_instance_by_uuid("INSTANCE_4")
if not self.strategy.safe_maintain(node_2, node_3):
self.strategy.try_maintain(node_2)
expected = [{'action_type': 'change_nova_service_state',
'input_parameters': {
'resource_id': 'Node_3',
'state': 'enabled'}},
{'action_type': 'change_nova_service_state',
'input_parameters': {
'resource_id': 'Node_2',
'state': 'disabled',
'disabled_reason': 'watcher_maintaining'}},
{'action_type': 'migrate',
'input_parameters': {
'destination_node': node_3.uuid,
'source_node': node_2.uuid,
'migration_type': 'live',
'resource_id': instance_4.uuid}}]
self.assertEqual(expected, self.strategy.solution.actions)