49826469b4
Via Ib3ea6de7f235d2a2d53a6576e0876ab171128b34 we make sure that we use the --live-restore docker option by default so we can avoid docker service restarts bringing down the whole control-plane. While this fixes new deployments, it leaves a problematic window open in case an operator has already deployed pike and wants to do a minor update. In such a case, the change mentioned above will fix things, but might be disruptive at the very first call: that's because the first run that pushes the --live-restore option will trigger a docker restart while the cluster is up, which will bring down all containers at the wrong time (i.e. without pacemaker knowing about it). In order to avoid this condition, let's stop all docker containers and the docker service, update the docker package, and then apply puppet for the docker profile (which will make sure the docker service is started). This simple approach avoids any potential docker service restarts due to puppet changes. It should be safe to do this because at step2 we are guaranteed that the cluster is down on that node and so the API services will not be reachable anyway. This way, before bringing the cluster back up, we have a docker service which is running with the live-restore option and will be more resilient in the face of docker service restarts. It can be argued that we should stop all containers and restart docker only when there is either a docker package update or if puppet will trigger a restart. Due to the puppet bug [1] it becomes rather hackish to detect if puppet would restart the resource before actually running it. That is one reason why in this pass we do it all the time, the other reason being that paunch might trigger a restart of most services anyway, so there is not that much point in protecting ourselves in this part of the code. The control plane in any case is usually largely unaffected by this, since pacemaker will move all the resources to the other nodes.
In the case of a compute node, an operator should be aware that services might be restarted and that critical workloads should be migrated to other compute nodes before a minor update. Updated 7 different controller nodes with this patch and they all worked correctly. NB: we did hit a race with older docker 1.12 versions (see RHBZ#1545356); 1.13 versions were fine. Co-Authored-By: Damien Ciabrini <dciabrin@redhat.com> [1] https://tickets.puppetlabs.com/browse/PUP-686 Change-Id: Ic08468854ce92e81cd84bd6c86a6b672b5a9d49b Related-Bug: #1747851
125 lines
4.6 KiB
YAML
125 lines
4.6 KiB
YAML
# Heat Orchestration Template header: template dialect and a short summary
# of what this service template is for.
heat_template_version: queens

description: >
  Configures docker on the host
|
|
|
|
# Template inputs. Every entry is written as description / type / default
# (plus constraints where present).
parameters:
  DockerInsecureRegistryAddress:
    description: >-
      Optional. The IP Address and Port of an insecure docker
      namespace that will be configured in /etc/sysconfig/docker.
      The value can be multiple addresses separated by commas.
    type: comma_delimited_list
    default: []
  DockerRegistryMirror:
    description: Optional. Configure a registry-mirror in the /etc/docker/daemon.json file.
    type: string
    default: ''
  EndpointMap:
    description: >-
      Mapping of service endpoint -> protocol. Typically set
      via parameter_defaults in the resource registry.
    type: json
    default: {}
  ServiceData:
    description: Dictionary packing service data
    type: json
    default: {}
  ServiceNetMap:
    description: >-
      Mapping of service_name -> network name. Typically set
      via parameter_defaults in the resource registry. This
      mapping overrides those in ServiceNetMapDefaults.
    type: json
    default: {}
  # No description is declared for this parameter in the template.
  DefaultPasswords:
    type: json
    default: {}
  RoleName:
    description: Role name on which the service is applied
    type: string
    default: ''
  RoleParameters:
    description: Parameters specific to the role
    type: json
    default: {}
  Debug:
    description: Set to True to enable debugging on all services.
    type: boolean
    default: false
  # String rather than boolean so that the empty default can mean "unset".
  DockerDebug:
    description: Set to True to enable debugging Docker services.
    type: string
    default: ''
    constraints:
      - allowed_values: ['', 'true', 'True', 'TRUE', 'false', 'False', 'FALSE']
  DockerOptions:
    description: Options that are used to startup the docker service.
    type: string
    default: '--log-driver=journald --signature-verification=false --iptables=false --live-restore'
  DeploymentUser:
    description: User added to the docker group in order to use container commands.
    type: string
    default: ''
|
|
|
|
# Named conditions; each one is true when the matching parameter still
# holds its empty value.
conditions:
  # No insecure registry addresses were supplied.
  insecure_registry_is_empty:
    equals:
      - get_param: DockerInsecureRegistryAddress
      - []
  # No registry mirror was supplied.
  insecure_registry_mirror_is_empty:
    equals:
      - get_param: DockerRegistryMirror
      - ''
  # DockerDebug was left at its empty default.
  service_debug_unset:
    equals:
      - get_param: DockerDebug
      - ''
  # No deployment user was supplied.
  deployment_user_is_empty:
    equals:
      - get_param: DeploymentUser
      - ''
|
|
|
|
outputs:
  role_data:
    description: Role data for the docker service
    value:
      service_name: docker
      # Settings for the tripleo::profile::base::docker puppet profile.
      # The first map is always included; each following entry adds its
      # key only when the corresponding parameter is non-empty.
      config_settings:
        map_merge:
          - tripleo::profile::base::docker::configure_network: true
            tripleo::profile::base::docker::network_options: "--bip=172.31.0.1/24"
            tripleo::profile::base::docker::docker_options: {get_param: DockerOptions}
            # Use Debug when DockerDebug is unset, otherwise DockerDebug.
            tripleo::profile::base::docker::debug:
              if:
                - service_debug_unset
                - {get_param: Debug}
                - {get_param: DockerDebug}
          - if:
              - insecure_registry_is_empty
              - {}
              - tripleo::profile::base::docker::insecure_registries: {get_param: DockerInsecureRegistryAddress}
          - if:
              - insecure_registry_mirror_is_empty
              - {}
              - tripleo::profile::base::docker::registry_mirror: {get_param: DockerRegistryMirror}
          - if:
              - deployment_user_is_empty
              - {}
              - tripleo::profile::base::docker::deployment_user: {get_param: DeploymentUser}
      step_config: |
        include ::tripleo::profile::base::docker
      upgrade_tasks:
        - name: Install docker packages on upgrade if missing
          when: step|int == 3
          yum:
            name: docker
            state: latest
      # Minor-update sequence, run only at step 2: stop every container and
      # the docker service, update the package, then let puppet configure
      # and start docker again.
      update_tasks:
        when: step|int == 2
        block:
          - name: Stop all containers
            # xargs is preferable to docker stop $(docker ps -q) as that
            # might generate a too long command line
            shell: docker ps -q | xargs --no-run-if-empty -n1 docker stop
          - name: Stop docker
            service:
              name: docker
              state: stopped
          - name: Update the docker package
            yum:
              name: docker
              state: latest
              # cache for tripleo/+bug/1703830
              update_cache: true
          - name: Apply puppet which will start the service again
            shell: |
              puppet apply --detailed-exitcodes --verbose \
                --modulepath /etc/puppet/modules:/opt/stack/puppet-modules:/usr/share/openstack-puppet/modules \
                -e "class { 'tripleo::profile::base::docker': step => 1, }"
            register: puppet_docker_apply
            # With --detailed-exitcodes, rc 0 and 2 are accepted as success;
            # rc 2 is reported as "changed".
            failed_when: puppet_docker_apply.rc not in [0, 2]
            changed_when: puppet_docker_apply.rc == 2
|