Async task support

We have a *ton* of stuff in devstack that is very linear, specifically
the ten-ish minutes we spend loading osc to run a single API command
against something. We also generate configs, sync databases, and other
things that use one core of our worker and make our runtime longer
than it really needs to be.

The idea in this patch is to make it super simple to run some things
in the background and then wait for them to finish before proceeding
to something that will require them to be done. This avoids the
interleaving you would expect by redirecting the async tasks to a log
file, and then cat'ing that log file synchronously during the wait
operation. The per-task log file remains so it's easier to examine
it in isolation.

Multiple people have reported a 22-30% improvement in the
time it takes to stack with this. More can be done, but what is here
already makes a significant difference.

Change-Id: I270a910b531641b023c13f75dfedca057a1f1031
This commit is contained in:
Dan Smith 2021-01-19 12:10:52 -08:00
parent 84ead5aacf
commit 30d9bf9a6d
9 changed files with 315 additions and 44 deletions

View File

@ -598,6 +598,17 @@
nodeset: openstack-single-node-bionic
voting: false
- job:
name: devstack-async
parent: tempest-full-py3
description: Async mode enabled
voting: false
vars:
devstack_localrc:
DEVSTACK_PARALLEL: True
zuul_copy_output:
/opt/stack/async: logs
- job:
name: devstack-platform-fedora-latest
parent: tempest-full-py3
@ -690,6 +701,7 @@
- devstack-platform-fedora-latest
- devstack-platform-centos-8
- devstack-platform-bionic
- devstack-async
- devstack-multinode
- devstack-unit-tests
- openstack-tox-bashate

View File

@ -113,7 +113,7 @@ cleanup_rpc_backend
cleanup_database
# Clean out data and status
sudo rm -rf $DATA_DIR $DEST/status
sudo rm -rf $DATA_DIR $DEST/status $DEST/async
# Clean out the log file and log directories
if [[ -n "$LOGFILE" ]] && [[ -f "$LOGFILE" ]]; then

View File

@ -6,7 +6,7 @@ if is_service_enabled tempest; then
source $TOP_DIR/lib/tempest
elif [[ "$1" == "stack" && "$2" == "install" ]]; then
echo_summary "Installing Tempest"
install_tempest
async_runfunc install_tempest
elif [[ "$1" == "stack" && "$2" == "post-config" ]]; then
# Tempest config must come after layer 2 services are running
:
@ -17,6 +17,7 @@ if is_service_enabled tempest; then
# local.conf Tempest option overrides
:
elif [[ "$1" == "stack" && "$2" == "test-config" ]]; then
async_wait install_tempest
echo_summary "Initializing Tempest"
configure_tempest
echo_summary "Installing Tempest Plugins"

View File

@ -21,6 +21,7 @@ source ${FUNC_DIR}/inc/ini-config
source ${FUNC_DIR}/inc/meta-config
source ${FUNC_DIR}/inc/python
source ${FUNC_DIR}/inc/rootwrap
source ${FUNC_DIR}/inc/async
# Save trace setting
_XTRACE_FUNCTIONS=$(set +o | grep xtrace)

225
inc/async Normal file
View File

@ -0,0 +1,225 @@
#!/bin/bash
#
# Symbolic asynchronous tasks for devstack
#
# Usage:
#
# async_runfunc my_shell_func foo bar baz
#
# ... do other stuff ...
#
# async_wait my_shell_func
#
# Master switch for background execution. When False (the default),
# async_run executes its command synchronously in the foreground.
# NOTE(review): trueorfalse is a devstack helper from functions-common;
# presumably it normalizes the user's setting to "True"/"False".
DEVSTACK_PARALLEL=$(trueorfalse False DEVSTACK_PARALLEL)
# Running total of milliseconds spent inside background tasks,
# accumulated by _async_incr_bg_time and reported by async_print_timing.
_ASYNC_BG_TIME=0
# Accumulate time spent in background tasks into the global counter
# used by async_print_timing. Argument is an elapsed time in ms.
function _async_incr_bg_time {
    local delta_ms="$1"
    _ASYNC_BG_TIME=$((_ASYNC_BG_TIME + delta_ms))
}
# Look up the recorded PID of a named future so it can be waited on.
# Prints the pid (or "self" for synchronous runs) on stdout; prints
# UNKNOWN and returns non-zero if no state file exists for that name.
function async_pidof {
    local task="$1"
    local state_file="${DEST}/async/${task}.ini"

    if [ ! -f "$state_file" ]; then
        echo 'UNKNOWN'
        return 1
    fi
    iniget $state_file job pid
}
# Emit a tagged log line for a job. If the message contains "%command",
# the job's full recorded command line is substituted into the output.
function async_log {
    local task="$1"
    shift
    local text="$*"
    local state_file=${DEST}/async/${task}.ini
    local job_pid job_cmd

    job_pid=$(iniget $state_file job pid)
    # '#' is used as the sed delimiter below, so strip it from the command
    job_cmd=$(iniget $state_file job command | tr '#' '-')
    text=$(echo "$text" | sed "s#%command#$job_cmd#g")

    echo "[Async ${task}:${job_pid}]: $text"
}
# Inner function that actually runs the requested task. We wrap it like this
# just so we can emit a finish message as soon as the work is done, to make
# it easier to find the tracking just before an error.
function async_inner {
local name="$1"
local rc
shift
set -o xtrace
if $* >${DEST}/async/${name}.log 2>&1; then
rc=0
set +o xtrace
async_log "$name" "finished successfully"
else
rc=$?
set +o xtrace
async_log "$name" "FAILED with rc $rc"
fi
iniset ${DEST}/async/${name}.ini job end_time $(date "+%s%3N")
return $rc
}
# Run something async. Takes a symbolic name and a list of arguments of
# what to run. Ideally this would be rarely used and async_runfunc() would
# be used everywhere for readability.
#
# This spawns the work in a background worker and records a "future" (an
# ini file under ${DEST}/async) to be collected by a later async_wait().
# With DEVSTACK_PARALLEL disabled, the command simply runs inline and its
# exit code is returned directly.
function async_run {
    local xtrace
    xtrace=$(set +o | grep xtrace)
    set +o xtrace

    local task="$1"
    shift
    local state_file=${DEST}/async/${task}.ini

    touch $state_file
    iniset $state_file job command "$*"
    iniset $state_file job start_time $(date +%s%3N)

    if [[ "$DEVSTACK_PARALLEL" != "True" ]]; then
        # Parallelism disabled: record a sentinel pid so async_wait
        # knows there is no process to reap, then run in the foreground
        iniset $state_file job pid "self"
        async_log "$task" "Running synchronously: %command"
        $xtrace
        $*
        return $?
    fi

    async_inner $task $* &
    iniset $state_file job pid $!
    async_log "$task" "running: %command"
    $xtrace
}
# Shortcut for running a shell function async. Uses the function name as the
# async name.
function async_runfunc {
    # Quote the expansions so arguments containing whitespace are passed
    # through intact (the previous unquoted $1 $* re-split them)
    async_run "$1" "$@"
}
# Wait for an async future to complete. May return immediately if already
# complete, or if the future has already been waited on (avoid this). May
# block until the future completes.
#
# Accepts one or more future names and collects each in order. Returns 0
# if every collected background future succeeded, otherwise the exit code
# of the first failing background future (and stops waiting on the rest).
function async_wait {
    local xtrace
    xtrace=$(set +o | grep xtrace)
    set +o xtrace

    local pid rc running inifile runtime
    rc=0
    for name in $*; do
        # Count of outstanding futures; purely informational for the log
        running=$(ls ${DEST}/async/*.ini 2>/dev/null | wc -l)
        inifile="${DEST}/async/${name}.ini"
        if pid=$(async_pidof "$name"); then
            async_log "$name" "Waiting for completion of %command" \
                "($running other jobs running)"
            # Record the time blocked here so async_print_timing can
            # discount it from the estimated serial runtime
            time_start async_wait
            if [[ "$pid" != "self" ]]; then
                # Do not actually call wait if we ran synchronously
                # NOTE(review): for "self" futures rc is left at its prior
                # value; their status was already returned by async_run
                if wait $pid; then
                    rc=0
                else
                    rc=$?
                fi
                # Replay the task's captured output here so it appears
                # un-interleaved at the point we waited for it
                cat ${DEST}/async/${name}.log
            fi
            time_stop async_wait
            # Derive the task's wall-clock runtime from the millisecond
            # start/end stamps recorded in its ini file
            local start_time
            local end_time
            start_time=$(iniget $inifile job start_time)
            end_time=$(iniget $inifile job end_time)
            _async_incr_bg_time $(($end_time - $start_time))
            runtime=$((($end_time - $start_time) / 1000))
            async_log "$name" "finished %command with result" \
                "$rc in $runtime seconds"
            # Removing the ini file marks the future as collected
            rm -f $inifile
            if [ $rc -ne 0 ]; then
                echo Stopping async wait due to error: $*
                break
            fi
        else
            # This could probably be removed - it is really just here
            # to help notice if you wait for something by the wrong
            # name, but it also shows up for things we didn't start
            # because they were not enabled.
            echo Not waiting for async task $name that we never started or \
                has already been waited for
        fi
    done

    $xtrace
    return $rc
}
# Check for uncollected futures and wait on them. Called at the end of
# the run as a safety net so no background task is silently leaked.
function async_cleanup {
    local inifile name
    if [[ "$DEVSTACK_PARALLEL" != "True" ]]; then
        return 0
    fi
    for inifile in $(find ${DEST}/async -name '*.ini'); do
        # Derive the task name from the ini file we are iterating over
        # (previously referenced an undefined $pidfile, which made every
        # leaked future report the bogus name ".ini")
        name=$(basename $inifile .ini)
        echo "WARNING: uncollected async future $name"
        async_wait $name || true
    done
}
# Prepare the async state directory: wipe anything left over from a
# previous run and (re)create it empty.
function async_init {
    local state_dir=${DEST}/async

    rm -Rf $state_dir
    mkdir -p $state_dir
}
# Print a summary of time spent in background tasks and an estimate of
# the speedup gained versus running everything serially. No-op unless
# parallel mode was enabled.
function async_print_timing {
    local bg_time_minus_wait
    local elapsed_time
    local serial_time
    local speedup

    if [[ "$DEVSTACK_PARALLEL" != "True" ]]; then
        return 0
    fi

    # The logic here is: All the background task time would be
    # serialized if we did not do them in the background. So we can
    # add that to the elapsed time for the whole run. However, time we
    # spend waiting for async things to finish adds to the elapsed
    # time, but is time where we're not doing anything useful. Thus,
    # we subtract that from the would-be-serialized time.

    # NOTE(review): this assumes time_start/time_stop populated
    # _TIME_TOTAL[async_wait] (i.e. async_wait ran at least once); if
    # that entry is unset the arithmetic below is malformed -- confirm
    # callers guarantee a prior async_wait.
    bg_time_minus_wait=$((\
        ($_ASYNC_BG_TIME - ${_TIME_TOTAL[async_wait]}) / 1000))
    # _TIME_BEGIN is recorded by the timing machinery at run start
    elapsed_time=$(($(date "+%s") - $_TIME_BEGIN))
    serial_time=$(($elapsed_time + $bg_time_minus_wait))

    echo
    echo "================="
    echo " Async summary"
    echo "================="
    echo " Time spent in the background minus waits: $bg_time_minus_wait sec"
    echo " Elapsed time: $elapsed_time sec"
    echo " Time if we did everything serially: $serial_time sec"
    echo " Speedup: " $(echo | awk "{print $serial_time / $elapsed_time}")
}

View File

@ -318,25 +318,25 @@ function create_keystone_accounts {
local admin_role="admin"
local member_role="member"
get_or_add_user_domain_role $admin_role $admin_user default
async_run ks-domain-role get_or_add_user_domain_role $admin_role $admin_user default
# Create service project/role
get_or_create_domain "$SERVICE_DOMAIN_NAME"
get_or_create_project "$SERVICE_PROJECT_NAME" "$SERVICE_DOMAIN_NAME"
async_run ks-project get_or_create_project "$SERVICE_PROJECT_NAME" "$SERVICE_DOMAIN_NAME"
# Service role, so service users do not have to be admins
get_or_create_role service
async_run ks-service get_or_create_role service
# The ResellerAdmin role is used by Nova and Ceilometer so we need to keep it.
# The admin role in swift allows a user to act as an admin for their project,
# but ResellerAdmin is needed for a user to act as any project. The name of this
# role is also configurable in swift-proxy.conf
get_or_create_role ResellerAdmin
async_run ks-reseller get_or_create_role ResellerAdmin
# another_role demonstrates that an arbitrary role may be created and used
# TODO(sleepsonthefloor): show how this can be used for rbac in the future!
local another_role="anotherrole"
get_or_create_role $another_role
async_run ks-anotherrole get_or_create_role $another_role
# invisible project - admin can't see this one
local invis_project
@ -349,10 +349,12 @@ function create_keystone_accounts {
demo_user=$(get_or_create_user "demo" \
"$ADMIN_PASSWORD" "default" "demo@example.com")
get_or_add_user_project_role $member_role $demo_user $demo_project
get_or_add_user_project_role $admin_role $admin_user $demo_project
get_or_add_user_project_role $another_role $demo_user $demo_project
get_or_add_user_project_role $member_role $demo_user $invis_project
async_wait ks-{domain-role,domain,project,service,reseller,anotherrole}
async_run ks-demo-member get_or_add_user_project_role $member_role $demo_user $demo_project
async_run ks-demo-admin get_or_add_user_project_role $admin_role $admin_user $demo_project
async_run ks-demo-another get_or_add_user_project_role $another_role $demo_user $demo_project
async_run ks-demo-invis get_or_add_user_project_role $member_role $demo_user $invis_project
# alt_demo
local alt_demo_project
@ -361,9 +363,9 @@ function create_keystone_accounts {
alt_demo_user=$(get_or_create_user "alt_demo" \
"$ADMIN_PASSWORD" "default" "alt_demo@example.com")
get_or_add_user_project_role $member_role $alt_demo_user $alt_demo_project
get_or_add_user_project_role $admin_role $admin_user $alt_demo_project
get_or_add_user_project_role $another_role $alt_demo_user $alt_demo_project
async_run ks-alt-member get_or_add_user_project_role $member_role $alt_demo_user $alt_demo_project
async_run ks-alt-admin get_or_add_user_project_role $admin_role $admin_user $alt_demo_project
async_run ks-alt-another get_or_add_user_project_role $another_role $alt_demo_user $alt_demo_project
# groups
local admin_group
@ -373,11 +375,15 @@ function create_keystone_accounts {
non_admin_group=$(get_or_create_group "nonadmins" \
"default" "non-admin group")
get_or_add_group_project_role $member_role $non_admin_group $demo_project
get_or_add_group_project_role $another_role $non_admin_group $demo_project
get_or_add_group_project_role $member_role $non_admin_group $alt_demo_project
get_or_add_group_project_role $another_role $non_admin_group $alt_demo_project
get_or_add_group_project_role $admin_role $admin_group $admin_project
async_run ks-group-memberdemo get_or_add_group_project_role $member_role $non_admin_group $demo_project
async_run ks-group-anotherdemo get_or_add_group_project_role $another_role $non_admin_group $demo_project
async_run ks-group-memberalt get_or_add_group_project_role $member_role $non_admin_group $alt_demo_project
async_run ks-group-anotheralt get_or_add_group_project_role $another_role $non_admin_group $alt_demo_project
async_run ks-group-admin get_or_add_group_project_role $admin_role $admin_group $admin_project
async_wait ks-demo-{member,admin,another,invis}
async_wait ks-alt-{member,admin,another}
async_wait ks-group-{memberdemo,anotherdemo,memberalt,anotheralt,admin}
if is_service_enabled ldap; then
create_ldap_domain

View File

@ -741,31 +741,36 @@ function create_nova_keys_dir {
sudo install -d -o $STACK_USER ${NOVA_STATE_PATH} ${NOVA_STATE_PATH}/keys
}
function init_nova_db {
local dbname="$1"
local conffile="$2"
recreate_database $dbname
$NOVA_BIN_DIR/nova-manage --config-file $conffile db sync --local_cell
}
# init_nova() - Initialize databases, etc.
function init_nova {
# All nova components talk to a central database.
# Only do this step once on the API node for an entire cluster.
if is_service_enabled $DATABASE_BACKENDS && is_service_enabled n-api; then
# (Re)create nova databases
async_run nova-cell-0 init_nova_db nova_cell0 $NOVA_CONF
for i in $(seq 1 $NOVA_NUM_CELLS); do
async_run nova-cell-$i init_nova_db nova_cell${i} $(conductor_conf $i)
done
recreate_database $NOVA_API_DB
$NOVA_BIN_DIR/nova-manage --config-file $NOVA_CONF api_db sync
recreate_database nova_cell0
# map_cell0 will create the cell mapping record in the nova_api DB so
# this needs to come after the api_db sync happens. We also want to run
# this before the db sync below since that will migrate both the nova
# and nova_cell0 databases.
# this needs to come after the api_db sync happens.
$NOVA_BIN_DIR/nova-manage cell_v2 map_cell0 --database_connection `database_connection_url nova_cell0`
# (Re)create nova databases
for i in $(seq 1 $NOVA_NUM_CELLS); do
recreate_database nova_cell${i}
$NOVA_BIN_DIR/nova-manage --config-file $(conductor_conf $i) db sync --local_cell
# Wait for DBs to finish from above
for i in $(seq 0 $NOVA_NUM_CELLS); do
async_wait nova-cell-$i
done
# Migrate nova and nova_cell0 databases.
$NOVA_BIN_DIR/nova-manage --config-file $NOVA_CONF db sync
# Run online migrations on the new databases
# Needed for flavor conversion
$NOVA_BIN_DIR/nova-manage --config-file $NOVA_CONF db online_data_migrations

View File

@ -330,6 +330,9 @@ if [[ ! -d $DATA_DIR ]]; then
safe_chmod 0755 $DATA_DIR
fi
# Create and/or clean the async state directory
async_init
# Configure proper hostname
# Certain services such as rabbitmq require that the local hostname resolves
# correctly. Make sure it exists in /etc/hosts so that is always true.
@ -1082,19 +1085,19 @@ if is_service_enabled keystone; then
create_keystone_accounts
if is_service_enabled nova; then
create_nova_accounts
async_runfunc create_nova_accounts
fi
if is_service_enabled glance; then
create_glance_accounts
async_runfunc create_glance_accounts
fi
if is_service_enabled cinder; then
create_cinder_accounts
async_runfunc create_cinder_accounts
fi
if is_service_enabled neutron; then
create_neutron_accounts
async_runfunc create_neutron_accounts
fi
if is_service_enabled swift; then
create_swift_accounts
async_runfunc create_swift_accounts
fi
fi
@ -1107,9 +1110,11 @@ write_clouds_yaml
if is_service_enabled horizon; then
echo_summary "Configuring Horizon"
configure_horizon
async_runfunc configure_horizon
fi
async_wait create_nova_accounts create_glance_accounts create_cinder_accounts
async_wait create_neutron_accounts create_swift_accounts configure_horizon
# Glance
# ------
@ -1117,7 +1122,7 @@ fi
# NOTE(yoctozepto): limited to node hosting the database which is the controller
if is_service_enabled $DATABASE_BACKENDS && is_service_enabled glance; then
echo_summary "Configuring Glance"
init_glance
async_runfunc init_glance
fi
@ -1131,7 +1136,7 @@ if is_service_enabled neutron; then
# Run init_neutron only on the node hosting the Neutron API server
if is_service_enabled $DATABASE_BACKENDS && is_service_enabled neutron; then
init_neutron
async_runfunc init_neutron
fi
fi
@ -1161,7 +1166,7 @@ fi
if is_service_enabled swift; then
echo_summary "Configuring Swift"
init_swift
async_runfunc init_swift
fi
@ -1170,7 +1175,7 @@ fi
if is_service_enabled cinder; then
echo_summary "Configuring Cinder"
init_cinder
async_runfunc init_cinder
fi
# Placement Service
@ -1178,9 +1183,16 @@ fi
if is_service_enabled placement; then
echo_summary "Configuring placement"
init_placement
async_runfunc init_placement
fi
# Wait for neutron and placement before starting nova
async_wait init_neutron
async_wait init_placement
async_wait init_glance
async_wait init_swift
async_wait init_cinder
# Compute Service
# ---------------
@ -1192,7 +1204,7 @@ if is_service_enabled nova; then
# TODO(stephenfin): Is it possible for neutron to *not* be enabled now? If
# not, remove the if here
if is_service_enabled neutron; then
configure_neutron_nova
async_runfunc configure_neutron_nova
fi
fi
@ -1236,6 +1248,8 @@ if is_service_enabled cinder; then
iniset $CINDER_CONF key_manager fixed_key "$FIXED_KEY"
fi
async_wait configure_neutron_nova
# Launch the nova-api and wait for it to answer before continuing
if is_service_enabled n-api; then
echo_summary "Starting Nova API"
@ -1282,7 +1296,7 @@ fi
if is_service_enabled nova; then
echo_summary "Starting Nova"
start_nova
create_flavors
async_runfunc create_flavors
fi
if is_service_enabled cinder; then
echo_summary "Starting Cinder"
@ -1331,6 +1345,8 @@ if is_service_enabled horizon; then
start_horizon
fi
async_wait create_flavors
# Create account rc files
# =======================
@ -1467,8 +1483,12 @@ else
exec 1>&3
fi
# Make sure we didn't leak any background tasks
async_cleanup
# Dump out the time totals
time_totals
async_print_timing
# Using the cloud
# ===============

View File

@ -184,3 +184,4 @@ if is_service_enabled cinder && is_package_installed lvm2; then
fi
clean_pyc_files
rm -Rf $DEST/async