Dave Wilde 482e845d92 Improve multi-node AIO robustness
To improve the readability and robustness of the mnaio feature, I have
replaced the tasks that shell out to virsh with the virt module
where available.  I have also created a vm-status play that will
hopefully help resolve SSH failures into the VMs.  This play utilizes
the block/rescue/handler pattern to attempt to restart the VM once if
it fails the initial SSH check.  Hopefully this will reduce the SSH
failures due to a stuck VM.  This adds a new variable called
vm_ssh_timeout which allows the deployer an easy place to override the
default timeout.  The python-lxml package is needed for the virt module.

Change-Id: I027556b71a8c26d08a56b4ffa56b2eeaf1cbabe9
2018-06-29 10:12:16 -05:00

78 lines
2.4 KiB
YAML

---
# Copyright 2018, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build an in-memory "vm_servers" inventory group on the control host.
# Every member of "pxe_servers" whose "server_vm" host variable is truthy
# is added; hosts without that variable are treated as false and skipped.
- name: Create vm_servers group
  gather_facts: false
  hosts: localhost
  tasks:
    - name: VM Servers group
      with_items: "{{ groups['pxe_servers'] }}"
      when: hostvars[item]['server_vm'] | default(false) | bool
      add_host:
        groups: vm_servers
        name: "{{ item }}"
# Check that every host in the "vm_servers" group accepts connections.
# If the first check fails, the rescue section power-cycles the VM once
# through the virt module and then re-checks connectivity.
#
# Variables read:
#   vm_ssh_timeout - overall timeout (seconds) passed to wait_for_connection;
#                    must be defined by the caller/inventory.
- name: VM Status
  hosts: vm_servers
  gather_facts: false
  tasks:
    - name: VM Connectivity Check
      block:
        # wait_for_connection uses the host's configured connection, so it
        # takes no "port" option; only the timing options are passed.
        - name: Wait for VM
          wait_for_connection:
            connect_timeout: 10
            sleep: 20
            timeout: "{{ vm_ssh_timeout }}"
      rescue:
        # The virt tasks use "connection: local" so they execute on the
        # control host instead of on the VM that just failed to respond.
        - name: Gather VM info (rescue)
          virt:
            command: status
            name: "{{ inventory_hostname }}"
          connection: local
          register: vm_info

        # Only a running domain needs to be destroyed before it is started.
        - name: Stop VM (rescue)
          virt:
            command: destroy
            name: "{{ inventory_hostname }}"
          connection: local
          when: vm_info.status == 'running'

        - name: Start VM (rescue)
          virt:
            command: start
            name: "{{ inventory_hostname }}"
          connection: local

        # Errors are ignored here so the result can be evaluated together
        # with the VM state in the final task below.
        - name: Wait for VM (rescue)
          wait_for_connection:
            connect_timeout: 10
            sleep: 20
            timeout: "{{ vm_ssh_timeout }}"
          register: vm_rescue
          ignore_errors: true

        - name: Gather VM info 2nd pass (rescue)
          virt:
            command: status
            name: "{{ inventory_hostname }}"
          connection: local
          register: vm_info_2

        # "vm_rescue is failed" tests the registered result directly; the
        # previous comparison of the boolean against the string 'true' could
        # never match, so this task never fired.
        # NOTE(review): both conditions must hold, so a VM that is running
        # but still unreachable does not fail here - confirm this is intended.
        - name: Fail if VM still offline (rescue)
          fail:
            msg: "{{ inventory_hostname }} is not responding and cannot be rescued"
          when:
            - vm_info_2.status != 'running'
            - vm_rescue is failed