ironic/tools/benchmark/generate-statistics.py
Julia Kreger ffff76a682 Add basic tools for benchmarking
Adds a horribly written, just hacked together little tool to help
provide sizing insight into an ironic deployment's state and underlying
performance.

Key data:
* Queries the list of nodes from a pure python interface level with the
  database and reports timing for the list of nodes to be returned.
  This information helps convey how long a periodic task hits the database
  just for the query.
* Requests *all* nodes using the query pattern/structure of the nova
  resource tracker, and uses the marker to make any additional requests.
  The data is parsed and collected, and identified vendors, if any,
  are counted.
* Collects basic data on conductors in terms of running, conductor groups
  as well as currently loaded drivers in the deployment.

All of this information provides operational insight into *what*
conditions exist within the deployment allowing developers to try
and identify solutions based on the unique circumstances of larger
deployments.

Also adds a utility to generate and semi-randomize data to allow us to
create a benchmark job in CI.

Change-Id: Iae660aea82db8f1c4567ee2982595ccfdf434fe3
2021-05-24 16:36:02 +00:00

196 lines
7.0 KiB
Python

#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import datetime
import json
import sys
import time
from unittest import mock

from ironic_lib import metrics_utils
import oslo_policy
from oslo_utils import timeutils

from ironic.api.controllers.v1 import node as node_api
from ironic.api.controllers.v1 import utils as api_utils
from ironic.common import context
from ironic.common import service
from ironic.conf import CONF  # noqa To Load Configuration
from ironic.db import api as db_api
from ironic.objects import conductor
from ironic.objects import node
def _calculate_delta(start, finish):
    """Return the interval between two timestamps.

    :param start: the earlier reading (callers in this file pass
                  ``time.time()`` values).
    :param finish: the later reading.
    :returns: ``finish - start``.
    """
    elapsed = finish - start
    return elapsed
def _add_a_line():
    """Print a horizontal rule of dashes to separate report sections."""
    separator = '------------------------------------------------------------'
    print(separator)
def _assess_db_performance():
    """Time a raw node listing made directly through the DB API.

    Two intervals are printed: how long it took to obtain the DB API
    instance (this span also covers printing the phase banner), and how
    long ``get_node_list()`` plus counting its result took.

    :returns: the number of nodes returned by the DB API.
    """
    t_begin = time.time()
    db_client = db_api.get_instance()
    print('Phase - Assess DB performance')
    _add_a_line()
    t_connected = time.time()
    node_count = len(db_client.get_node_list())
    t_queried = time.time()

    connect_seconds = _calculate_delta(t_begin, t_connected)
    query_seconds = _calculate_delta(t_connected, t_queried)
    print('Obtained DB client in %s seconds.' % connect_seconds)
    print('Returned %s nodes in python %s seconds from the DB.\n' %
          (node_count, query_seconds))
    # The count is handed back so a caller can reuse it.
    return node_count
def _assess_db_and_object_performance():
    """Time loading every node as an object and estimate the data volume.

    Prints how long ``Node.list()`` took, how long it took to walk every
    returned object and serialize it, the total serialized size, and a
    per-vendor count of the ``vendor`` values found in
    ``driver_internal_info``.

    Comparing these timings with the raw DB phase gives a relative idea
    of whether the database or the object conversion is the bottleneck.
    """
    print('Phase - Assess DB & Object conversion Performance')
    _add_a_line()
    start = time.time()
    node_list = node.Node().list(context.get_admin_context())
    got_list = time.time()
    delta = _calculate_delta(start, got_list)
    print('Obtained list of node objects in %s seconds.' % delta)

    count = 0
    tbl_size = 0
    for node_obj in node_list:
        # Fully serialize each node so every field is actually touched.
        # sys.getsizeof() on the dict would only measure the dict
        # container itself (it is shallow), not the data it holds, so the
        # length of the JSON text is used instead. json.dumps() emits
        # ASCII by default, so characters == bytes here. default=str is a
        # deliberate fallback for values such as datetimes: only the
        # size matters, not round-tripping.
        serialized = json.dumps(node_obj.as_dict(secure=True), default=str)
        tbl_size += len(serialized)
        count += 1
    delta = _calculate_delta(got_list, time.time())
    print('Took %s seconds to iterate through %s node objects.' %
          (delta, count))

    # Tally vendors outside of the timed section above.
    observed_vendors = collections.Counter()
    for node_obj in node_list:
        # NOTE(review): driver_internal_info is assumed to be a dict or
        # None; the None guard is defensive - confirm against the object
        # model.
        vendor = (node_obj.driver_internal_info or {}).get('vendor')
        if vendor:
            # str() keeps the tally working even if a vendor value is
            # not hashable.
            observed_vendors[str(vendor)] += 1
    print('Observed vendors: %s' % dict(observed_vendors))
    print('Nodes table is roughly %s bytes of JSON.\n' % tbl_size)
@mock.patch('ironic.api.request')  # noqa patch needed for the object model
@mock.patch.object(metrics_utils, 'get_metrics_logger', lambda *_: mock.Mock)
@mock.patch.object(api_utils, 'check_list_policy', lambda *_: None)
@mock.patch.object(api_utils, 'check_allow_specify_fields', lambda *_: None)
@mock.patch.object(api_utils, 'check_allowed_fields', lambda *_: None)
@mock.patch.object(oslo_policy.policy, 'LOG', autospec=True)
def _assess_db_object_and_api_performance(mock_log, mock_request):
    """Time retrieval of every node through the nodes API controller.

    The decorators replace the API request object, the policy/field
    checks and the policy logger so that ``NodesController`` can be driven
    directly, without a running API service. Nodes are requested page by
    page, feeding the uuid of the last node of each page back in as the
    marker, until an empty page is returned.

    :param mock_log: mock injected for ``oslo_policy.policy.LOG``.
    :param mock_request: mock injected for ``ironic.api.request``.
    """
    print('Phase - Assess DB, Object conversion & API Performance')
    _add_a_line()
    # Just mock it to silence it since getting the logger to update
    # config seems like not a thing once started. :\
    mock_log.debug = mock.Mock()
    # Internal logic requires major/minor versions and a context to
    # proceed. This is just to make the NodesController respond properly.
    mock_request.context = context.get_admin_context()
    mock_request.version.major = 1
    mock_request.version.minor = 71
    start = time.time()
    node_api_controller = node_api.NodesController()
    node_api_controller.context = context.get_admin_context()
    fields = ("uuid", "power_state", "target_power_state",
              "provision_state", "target_provision_state", "last_error",
              "maintenance", "properties", "instance_uuid", "traits",
              "resource_class")

    def _get_page(marker):
        """Return the list of nodes that follows *marker* (None = first)."""
        res = node_api_controller._get_nodes_collection(
            chassis_uuid=None,
            instance_uuid=None,
            associated=None,
            maintenance=None,
            retired=None,
            provision_state=None,
            marker=marker,
            limit=None,
            sort_key="id",
            sort_dir="asc",
            # A fresh list per call, in case the controller mutates it.
            fields=list(fields))
        return res['nodes']

    total_nodes = 0
    page = _get_page(None)
    # Keep paging until an empty page comes back. This also covers a
    # deployment with zero nodes, where indexing the last element of the
    # first page would otherwise fail.
    while page:
        total_nodes += len(page)
        print(" ** Getting nodes ** %s Elapsed: %s seconds." %
              (total_nodes, _calculate_delta(start, time.time())))
        page = _get_page(page[-1]['uuid'])
    delta = _calculate_delta(start, time.time())
    print('Took %s seconds to return all %s nodes via '
          'nodes API call pattern.\n' % (delta, total_nodes))
def _report_conductors():
    """Print a summary of conductors, conductor groups and drivers.

    A conductor is counted as online when its ``updated_at`` timestamp is
    less than 90 seconds old. Only drivers reported by online conductors
    are included in the driver list.
    """
    print('Phase - identifying conductors/drivers')
    _add_a_line()
    conductors = conductor.Conductor().list(
        context.get_admin_context(),
    )
    drivers = set()
    groups = []
    online_count = 0
    online_by = (timeutils.utcnow(with_timezone=True)
                 - datetime.timedelta(seconds=90))
    for conductor_obj in conductors:
        if conductor_obj.conductor_group:
            groups.append(conductor_obj.conductor_group)
        last_update = conductor_obj.updated_at
        # NOTE(review): a record that has never been updated may carry no
        # timestamp; treat it as offline rather than failing on the
        # comparison - confirm against the DB model.
        if last_update is not None and last_update > online_by:
            online_count += 1
            drivers.update(conductor_obj.drivers)
    conductor_count = len(conductors)
    print('Conductor count: %s' % conductor_count)
    print('Online conductor count: %s' % online_count)
    running_with_groups = len(groups)
    print('Conductors with conductor_groups: %s' % running_with_groups)
    group_count = len(set(groups))
    print('Conductor group count: %s' % group_count)
    # Sorted so the report is stable from one run to the next; plain set
    # iteration order varies between interpreter runs.
    driver_list = sorted(drivers)
    print('Presently supported drivers: %s' % driver_list)
def main():
    """Prepare the service environment and run each report phase in order.

    ``service.prepare_service()`` is called first, then the ``debug``
    option is overridden to ``False`` before the phases run.
    """
    service.prepare_service()
    CONF.set_override('debug', False)
    phases = (
        _assess_db_performance,
        _assess_db_and_object_performance,
        _assess_db_object_and_api_performance,
        _report_conductors,
    )
    for run_phase in phases:
        run_phase()
# Script entry point: run the report and use main()'s result as the
# process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)