From 192dcd1e1b9baf7f3177a694c2b1ce8bd62d9159 Mon Sep 17 00:00:00 2001
From: Mark Goddard <mark@stackhpc.com>
Date: Fri, 22 Mar 2019 14:59:41 +0000
Subject: [PATCH] Fix booting instances after nova-compute upgrade

After upgrading from Rocky to Stein, nova-compute services fail to start
new instances with the following error message:

Failed to allocate the network(s), not rescheduling.

Looking in the nova-compute logs, we also see this:

Neutron Reported failure on event
network-vif-plugged-60c05a0d-8758-44c9-81e4-754551567be5 for instance
32c493c4-d88c-4f14-98db-c7af64bf3324: NovaException: In shutdown, no new
events can be scheduled

During the upgrade process, we send the nova containers a SIGHUP to
cause them to reload their object version state. According to the nova
team on IRC, this is a known issue: oslo.service performs a full
shutdown in response to SIGHUP, which breaks nova-compute. There is a
patch [1] in review to address this.

The workaround employed here is to restart the nova-compute services
instead of sending them SIGHUP. The proxy services, which exit non-zero
in response to SIGHUP, are restarted as well.

[1] https://review.openstack.org/#/c/641907
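
As an illustrative manual equivalent of this workaround (assuming the
default kolla container name nova_compute used elsewhere in this role),
an operator hitting the broken state can restart the container rather
than signalling it:

  # Sending SIGHUP to PID 1 is what triggers the oslo.service bug:
  docker exec -t nova_compute kill -1 1
  # Workaround: restart the container instead:
  docker restart nova_compute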

Change-Id: Ia4fcc558a3f62ced2d629d7a22d0bc1eb6b879f1
Closes-Bug: #1821362
---
 ansible/roles/nova/defaults/main.yml | 11 +++++++
 ansible/roles/nova/tasks/config.yml  | 12 +------
 ansible/roles/nova/tasks/reload.yml  | 49 +++++++++++++++++++---------
 3 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/ansible/roles/nova/defaults/main.yml b/ansible/roles/nova/defaults/main.yml
index 8b0152ef3d..8081b7ad45 100644
--- a/ansible/roles/nova/defaults/main.yml
+++ b/ansible/roles/nova/defaults/main.yml
@@ -357,6 +357,17 @@ nova_safety_upgrade: "no"
 nova_libvirt_port: "16509"
 nova_ssh_port: "8022"
 
+nova_services_require_nova_conf:
+  - nova-api
+  - nova-compute
+  - nova-compute-ironic
+  - nova-conductor
+  - nova-consoleauth
+  - nova-novncproxy
+  - nova-serialproxy
+  - nova-scheduler
+  - nova-spicehtml5proxy
+
 ####################
 # Notification
 ####################
diff --git a/ansible/roles/nova/tasks/config.yml b/ansible/roles/nova/tasks/config.yml
index 60cab45be8..6c0349e2dc 100644
--- a/ansible/roles/nova/tasks/config.yml
+++ b/ansible/roles/nova/tasks/config.yml
@@ -81,16 +81,6 @@
 - name: Copying over nova.conf
   become: true
   vars:
-    services_require_nova_conf:
-      - nova-api
-      - nova-compute
-      - nova-compute-ironic
-      - nova-conductor
-      - nova-consoleauth
-      - nova-novncproxy
-      - nova-serialproxy
-      - nova-scheduler
-      - nova-spicehtml5proxy
     service_name: "{{ item.key }}"
   merge_configs:
     sources:
@@ -105,7 +95,7 @@
   when:
     - inventory_hostname in groups[item.value.group]
     - item.value.enabled | bool
-    - item.key in services_require_nova_conf
+    - item.key in nova_services_require_nova_conf
   with_dict: "{{ nova_services }}"
   notify:
     - "Restart {{ item.key }} container"
diff --git a/ansible/roles/nova/tasks/reload.yml b/ansible/roles/nova/tasks/reload.yml
index 6b37a9091d..96f57f9681 100644
--- a/ansible/roles/nova/tasks/reload.yml
+++ b/ansible/roles/nova/tasks/reload.yml
@@ -1,21 +1,38 @@
 ---
 # This play calls sighup on every service to refresh upgrade levels
-- name: Sighup nova-api
-  command: docker exec -t nova_api kill -1 1
-  when: inventory_hostname in groups['nova-api']
 
-- name: Sighup nova-conductor
-  command: docker exec -t nova_conductor kill -1 1
-  when: inventory_hostname in groups['nova-conductor']
+# NOTE(mgoddard): Currently (just prior to Stein release), sending SIGHUP to
+# nova compute services leaves them in a broken state in which they cannot
+# start new instances. The following error is seen in the logs:
+# "In shutdown, no new events can be scheduled"
+# To work around this we restart the nova-compute services.
+# Speaking to the nova team, this seems to be an issue in oslo.service,
+# with a fix proposed here: https://review.openstack.org/#/c/641907.
+# This issue also seems to affect the proxy services, which exit non-zero in
+# response to a SIGHUP, so restart those too.
+# TODO(mgoddard): Remove this workaround when this bug has been fixed.
 
-- name: Sighup nova-consoleauth
-  command: docker exec -t nova_consoleauth kill -1 1
-  when: inventory_hostname in groups['nova-consoleauth']
+- name: Send SIGHUP to nova services
+  become: true
+  command: docker exec -t {{ item.value.container_name }} kill -1 1
+  when:
+    - inventory_hostname in groups[item.value.group]
+    - item.value.enabled | bool
+    - item.key in nova_services_require_nova_conf
+    - not item.key.startswith('nova-compute')
+    - not item.key.endswith('proxy')
+  with_dict: "{{ nova_services }}"
 
-- name: Sighup nova-scheduler
-  command: docker exec -t nova_scheduler kill -1 1
-  when: inventory_hostname in groups['nova-scheduler']
-
-- name: Sighup nova-compute
-  command: docker exec -t nova_compute kill -1 1
-  when: inventory_hostname in groups['compute']
+- name: Restart nova compute and proxy services
+  become: true
+  kolla_docker:
+    action: restart_container
+    common_options: "{{ docker_common_options }}"
+    name: "{{ item.value.container_name }}"
+  when:
+    - inventory_hostname in groups[item.value.group]
+    - item.value.enabled | bool
+    - item.key in nova_services_require_nova_conf
+    - item.key.startswith('nova-compute')
+      or item.key.endswith('proxy')
+  with_dict: "{{ nova_services }}"