diff --git a/releasenotes/notes/avoid-diverged-slave-when-migrating-mariadb-master-37e2429a1ea75913.yaml b/releasenotes/notes/avoid-diverged-slave-when-migrating-mariadb-master-37e2429a1ea75913.yaml new file mode 100644 index 0000000000..f2c3f04ca9 --- /dev/null +++ b/releasenotes/notes/avoid-diverged-slave-when-migrating-mariadb-master-37e2429a1ea75913.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + MariaDB allows a server to be a master and a slave simultaneously, so when + migrating masters, if the old master is reactivated before attaching the + other replicas to the new master, new unexpected GTIDs may be created on + the old master and synced to some of the other replicas by chance, as the + other replicas are still connected to the old one at that time. After that + these diverged slaves will fail to change to the new master. This will be + fixed by first attaching the other replicas to the new master, and then + dealing with the old master. + Fixes #1754539 diff --git a/trove/taskmanager/manager.py b/trove/taskmanager/manager.py index e600e7adf8..7559c1ecf9 100644 --- a/trove/taskmanager/manager.py +++ b/trove/taskmanager/manager.py @@ -99,6 +99,26 @@ class Manager(periodic_task.PeriodicTasks): replica_models): # First, we transition from the old master to new as quickly as # possible to minimize the scope of unrecoverable error + + # NOTE(zhaochao): we cannot reattach the old master to the new + # one immediately after the new master is up, because for MariaDB + # the other replicas are still connected to the old master, and + # during reattaching the old master as a slave, new GTIDs may be + # created and synced to the replicas. After that, when attaching + # the replicas to the new master, 'START SLAVE' will fail with + # 'fatal error 1236' if the binlog of the replica diverged from + # the new master. So the proper order should be: + # -1. make the old master read only (and detach floating ips) + # -2. make sure the new master is up-to-date + # -3. 
detach the new master from the old one + # -4. enable the new master (and attach floating ips) + # -5. attach the other replicas to the new master + # -6. attach the old master to the new one + # (and attach floating ips) + # -7. demote the old master + # What we changed here is the order of the 6th step; previously + # this step took place right after step 4, which caused failures + # with MariaDB replication. old_master.make_read_only(True) master_ips = old_master.detach_public_ips() slave_ips = master_candidate.detach_public_ips() @@ -106,10 +126,8 @@ class Manager(periodic_task.PeriodicTasks): master_candidate.wait_for_txn(latest_txn_id) master_candidate.detach_replica(old_master, for_failover=True) master_candidate.enable_as_master() - old_master.attach_replica(master_candidate) master_candidate.attach_public_ips(master_ips) master_candidate.make_read_only(False) - old_master.attach_public_ips(slave_ips) # At this point, should something go wrong, there # should be a working master with some number of working slaves, @@ -138,6 +156,10 @@ class Manager(periodic_task.PeriodicTasks): error_messages += "%s (%s)\n" % ( exc_fmt % msg_content, ex) + # dealing with the old master after all the other replicas + # have been migrated. + old_master.attach_replica(master_candidate) + old_master.attach_public_ips(slave_ips) try: old_master.demote_replication_master() except Exception as ex: