From c2269d70a23b55c459233ab5fc28362b7c2ca766 Mon Sep 17 00:00:00 2001
From: Vasyl Saienko <vsaienko@mirantis.com>
Date: Mon, 18 Nov 2024 08:17:12 +0000
Subject: [PATCH] [mariadb] Use service IP to discover endpoints

It was observed that, under certain circumstances,
Galera instances can keep using the old IP address of a node
after a pod restart. This patch changes the value of the
wsrep_cluster_address variable: instead of listing the
DNS names of all cluster nodes, the discovery service
IP address is used. In this case wsrep_node_address is set to the
node's IP address instead of its DNS name - otherwise the SST method
will fail.

Co-Authored-By: Oleksii Grudev <ogrudev@mirantis.com>

Change-Id: I8059f28943150785abd48316514c0ffde56dfde5
---
 mariadb/Chart.yaml                       |  2 +-
 mariadb/templates/bin/_start.py.tpl      | 30 ++++++++++--------------
 mariadb/templates/service-discovery.yaml |  2 +-
 releasenotes/notes/mariadb.yaml          |  1 +
 4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/mariadb/Chart.yaml b/mariadb/Chart.yaml
index 6a2d1023f..e4db23543 100644
--- a/mariadb/Chart.yaml
+++ b/mariadb/Chart.yaml
@@ -15,7 +15,7 @@ apiVersion: v1
 appVersion: v10.6.7
 description: OpenStack-Helm MariaDB
 name: mariadb
-version: 0.2.62
+version: 0.2.63
 home: https://mariadb.com/kb/en/
 icon: http://badges.mariadb.org/mariadb-badge-180x60.png
 sources:
diff --git a/mariadb/templates/bin/_start.py.tpl b/mariadb/templates/bin/_start.py.tpl
index 84dd01eac..90fea03f9 100644
--- a/mariadb/templates/bin/_start.py.tpl
+++ b/mariadb/templates/bin/_start.py.tpl
@@ -49,6 +49,10 @@ logger.addHandler(ch)
 local_hostname = socket.gethostname()
 logger.info("This instance hostname: {0}".format(local_hostname))
 
+# Get local node IP address
+local_ip = socket.gethostbyname(local_hostname)
+logger.info("This instance IP address: {0}".format(local_ip))
+
 # Get the instance number
 instance_number = local_hostname.split("-")[-1]
 logger.info("This instance number: {0}".format(instance_number))
@@ -270,18 +274,14 @@ def mysqld_write_cluster_conf(mode='run'):
     for node in range(int(mariadb_replicas)):
         node_hostname = "{0}-{1}".format(pod_name_prefix, node)
         if local_hostname == node_hostname:
-            wsrep_node_address = "{0}.{1}:{2}".format(
-                node_hostname, discovery_domain, wsrep_port)
-            cluster_config_params['wsrep_node_address'] = wsrep_node_address
+            cluster_config_params['wsrep_node_address'] = local_ip
             wsrep_node_name = "{0}.{1}".format(node_hostname, discovery_domain)
             cluster_config_params['wsrep_node_name'] = wsrep_node_name
-        else:
-            addr = "{0}.{1}:{2}".format(node_hostname, discovery_domain,
-                                        wsrep_port)
-            wsrep_cluster_members.append(addr)
-    if wsrep_cluster_members and mode == 'run':
-        cluster_config_params['wsrep_cluster_address'] = "gcomm://{0}".format(
-            ",".join(wsrep_cluster_members))
+
+    if mode == 'run':
+        cluster_config_params['wsrep_cluster_address'] = "gcomm://{0}:{1}".format(
+            discovery_domain, wsrep_port)
+
     else:
         cluster_config_params['wsrep_cluster_address'] = "gcomm://"
     cluster_config_file = '/etc/mysql/conf.d/10-cluster-config.cnf'
@@ -913,14 +913,6 @@ def run_mysqld(cluster='existing'):
             "This is a fresh node joining the cluster for the 1st time, not attempting to set admin passwords or upgrading"
         )
 
-    # Node ready to start MariaDB, update cluster state to live and remove
-    # reboot node info, if set previously.
-    if cluster == 'new':
-        set_configmap_annotation(
-            key='openstackhelm.openstack.org/cluster.state', value='live')
-        set_configmap_annotation(
-            key='openstackhelm.openstack.org/reboot.node', value='')
-
     logger.info("Launching MariaDB")
     run_cmd_with_logging(mysqld_cmd, logger)
 
@@ -1003,6 +995,8 @@ elif get_cluster_state() == 'live':
                     "it")
                 while not check_for_active_nodes():
                     time.sleep(default_sleep)
+                set_configmap_annotation(
+                    key='openstackhelm.openstack.org/cluster.state', value='live')
                 run_mysqld()
 elif get_cluster_state() == 'reboot':
     reboot_node = get_configmap_value(
diff --git a/mariadb/templates/service-discovery.yaml b/mariadb/templates/service-discovery.yaml
index 378878c06..d5efd3131 100644
--- a/mariadb/templates/service-discovery.yaml
+++ b/mariadb/templates/service-discovery.yaml
@@ -30,7 +30,7 @@ spec:
     - name: sst
       port: {{ tuple "oslo_db" "direct" "sst" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
   clusterIP: None
-  publishNotReadyAddresses: true
+  publishNotReadyAddresses: false
   selector:
 {{ tuple $envAll "mariadb" "server" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
 {{ .Values.network.mariadb_discovery | include "helm-toolkit.snippets.service_params" | indent 2 }}
diff --git a/releasenotes/notes/mariadb.yaml b/releasenotes/notes/mariadb.yaml
index 89cab1a10..823b0f41d 100644
--- a/releasenotes/notes/mariadb.yaml
+++ b/releasenotes/notes/mariadb.yaml
@@ -78,4 +78,5 @@ mariadb:
   - 0.2.60 Refactor liveness/readiness probes
   - 0.2.61 Avoid using deprecated isAlive()
   - 0.2.62 Implement mariadb upgrade during start
+  - 0.2.63 Use service ip for endpoint discovery
 ...