From c38dd767118164aae37613d557ed23691813f617 Mon Sep 17 00:00:00 2001
From: Mark Goddard <mark@stackhpc.com>
Date: Tue, 2 Jul 2019 08:30:02 +0100
Subject: [PATCH] Wait for all compute services before cell discovery

There is a race condition during nova deploy since we only wait until at
least one compute service has registered itself before performing cells
v2 host discovery.  It's quite possible that other compute nodes will
not yet have registered and will therefore not be discovered. This
leaves them not mapped into a cell, and results in the following error
if the scheduler picks one when booting an instance:

Host 'xyz' is not mapped to any cell

The problem has been exacerbated by a recently merged fix [1][2] for a
nova race condition; that fix disabled the dynamic periodic discovery
mechanism in the nova scheduler.

This change fixes the issue by waiting for all expected compute services
to register themselves before performing host discovery. This includes
both virtualised compute services and bare metal compute services.

[1] https://bugs.launchpad.net/kolla-ansible/+bug/1832987
[2] https://review.opendev.org/665554

Change-Id: I2915e2610e5c0b8d67412e7ec77f7575b8fe9921
Closes-Bug: #1835002
---
 .../roles/nova/tasks/discover_computes.yml    | 45 +++++++++++++++++--
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/ansible/roles/nova/tasks/discover_computes.yml b/ansible/roles/nova/tasks/discover_computes.yml
index 647bd5d59c..37d4ed0c97 100644
--- a/ansible/roles/nova/tasks/discover_computes.yml
+++ b/ansible/roles/nova/tasks/discover_computes.yml
@@ -1,5 +1,44 @@
 ---
-- name: Waiting for nova-compute service up
+# We need to wait for all expected compute services to register before running
+# cells v2 host discovery. This includes virtualised compute services and
+# ironic compute services.
+# Support --limit by including only hosts that are in ansible_play_batch.
+- name: Build a list of expected compute service hosts
+  vars:
+    # For virt, use ansible_nodename rather than inventory_hostname, since this
+    # is similar to what nova uses internally as its default for the
+    # [DEFAULT] host config option.
+    virt_compute_service_hosts: >-
+      {{ groups['compute'] |
+         intersect(ansible_play_batch) |
+         map('extract', hostvars, 'ansible_nodename') |
+         list }}
+    # For ironic, use {{ansible_hostname}}-ironic since this is what we
+    # configure for [DEFAULT] host in nova.conf.
+    ironic_compute_service_hosts: >-
+      {{ (groups['nova-compute-ironic'] |
+          intersect(ansible_play_batch) |
+          map('extract', hostvars, 'ansible_hostname') |
+          map('regex_replace', '^(.*)$', '\1-ironic') |
+          list)
+         if enable_ironic | bool else [] }}
+  set_fact:
+    expected_compute_service_hosts: "{{ virt_compute_service_hosts + ironic_compute_service_hosts }}"
+  run_once: True
+  delegate_to: "{{ groups['nova-api'][0] }}"
+
+- name: Waiting for nova-compute services to register themselves
+  vars:
+    # A list containing the 'Host' field of compute services that have
+    # registered themselves.  Don't exclude compute services that are disabled
+    # since these could have been explicitly disabled by the operator. While we
+    # could exclude services that are down, 'nova-manage cell_v2
+    # discover_hosts' does not do this, so let's not block on it here.
+    found_compute_service_hosts: >-
+      {{ nova_compute_services.stdout |
+         from_json |
+         map(attribute='Host') |
+         list }}
   become: true
   command: >
     docker exec kolla_toolbox openstack
@@ -12,7 +51,7 @@
     --os-password {{ keystone_admin_password }}
     --os-user-domain-name {{ openstack_auth.domain_name }}
     --os-region-name {{ openstack_region_name }}
-    compute service list -f json --service nova-compute
+    compute service list --format json --column Host --service nova-compute
   register: nova_compute_services
   changed_when: false
   run_once: True
@@ -21,7 +60,7 @@
   delay: 10
   until:
     - nova_compute_services is success
-    - nova_compute_services.stdout | from_json | length != 0
+    - found_compute_service_hosts is superset(expected_compute_service_hosts)
 
 # TODO(yoctozepto): no need to do --by-service if ironic not used
 - name: Discover nova hosts