MGR集群重新添加无效节点

原创三石 All China Database Union 2024-05-09

1433

1、简介

管理中有一套MySQL 8.0.33的MGR集群，因为是测试库，当时节点2、3都接连宕了也没注意，因为主库在安稳的给开发那边提供使用，当我发现的时候已经断了很久，又因为是测试库，直到今天才重新想着同步好。当初宕的原因就不去纠结和定位了。

2、操作步骤

2.1重装节点2、3的MySQL数据库

模板可以看我以前的操作，但是需要新添加的一些参数和设置如下

节点1

[Mysql]
disabled_storage_engines="MyISAM,BLACKHOLE,FEDERATED,ARCHIVE,MEMORY"
#server_id=100
gtid_mode=ON
enforce_gtid_consistency=ON
#log_bin=binlog
plugin_dir=/mysql/app/mysql8.0.33/lib/plugin/
log_slave_updates=ON
binlog_format=ROW
master_info_repository=TABLE
relay_log_info_repository=TABLE
transaction_write_set_extraction=XXHASH64
binlog_transaction_dependency_tracking=WRITESET
slave_parallel_type=LOGICAL_CLOCK
slave_preserve_commit_order=1
#组复制相关参数
group_replication_start_on_boot=off
group_replication_local_address="mjndfev1:33061"
group_replication_group_seeds="mjndfev1:33061,mjndfev2:33061,mjndfev3:33061"
group_replication_bootstrap_group=off
#Plugin插件
plugin-load-add="mysql_clone.so;group_replication.so"
clone=FORCE_PLUS_PERMANENT

节点2

server_id=101
group_replication_local_address="mjndfev2:33061"

节点3

server_id=102
group_replication_local_address="mjndfev3:33061"

请记得重启数据库生效参数

2.2创建集群用户并赋权

create user 'mgr_user'@'%' identified by 'aaaa';
grant clone_admin, connection_admin, create user, execute, file,group_replication_admin, persist_ro_variables_admin, process, reload,replication client, replication slave, replication_applier, replication_slave_admin, role_admin, select, shutdown, system_variables_admin on *.* to 'mgr_user'@'%' with grant option;
grant delete, insert, update on mysql.* to 'mgr_user'@'%' with grant option;
GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata.* TO 'mgr_user'@'%' WITH GRANT OPTION;
GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_bkp.* TO 'mgr_user'@'%' WITH GRANT OPTION;
GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_previous.* TO 'mgr_user'@'%' WITH GRANT OPTION;
GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_bkp.* TO 'mgr_user'@'%' WITH GRANT OPTION;
GRANT ALTER, ALTER ROUTINE, CREATE, CREATE ROUTINE, CREATE TEMPORARY TABLES, CREATE VIEW, DELETE, DROP, EVENT, INDEX, INSERT, LOCK TABLES, REFERENCES, SHOW VIEW, TRIGGER, UPDATE ON mysql_innodb_cluster_metadata_previous.* TO 'mgr_user'@'%' WITH GRANT OPTION;

2.3使用MySQL Shell登录创建集群

2.3.1登录MySQL Shell

mysqlsh -umgr_user -p'aaaa'  -P 3306 -S /mysql/data/mysql3306/socket/mysql.sock

2.3.2重新定义集群

 MySQL localhost JS > var cluster = dba.getCluster('Cluster01')

2.3.3 查看集群状态

MySQL  localhost  JS > cluster.status();
{
    "clusterName": "Cluster01", 
    "defaultReplicaSet": {
        "name": "default", 
        "primary": "node1:3306", 
        "ssl": "REQUIRED", 
        "status": "OK_NO_TOLERANCE_PARTIAL", 
        "statusText": "Cluster is NOT tolerant to any failures. 2 members are not active.", 
        "topology": {
            "node1:3306": {
                "address": "node1:3306", 
                "memberRole": "PRIMARY", 
                "mode": "R/W", 
                "readReplicas": {}, 
                "replicationLag": "applier_queue_applied", 
                "role": "HA", 
                "status": "ONLINE", 
                "version": "8.0.33"
            }, 
            "node2:3306": {
                "address": "node2:3306", 
                "instanceErrors": [
                    "WARNING: Instance is NOT a PRIMARY but super_read_only option is OFF.", 
                    "NOTE: group_replication is stopped.", 
                    "WARNING: server_uuid for instance has changed from its last known value. Use cluster.rescan() to update the metadata."
                ], 
                "memberRole": "SECONDARY", 
                "memberState": "OFFLINE", 
                "mode": "n/a", 
                "readReplicas": {}, 
                "role": "HA", 
                "status": "(MISSING)", 
                "version": "8.0.33"
            }, 
            "node3:3306": {
                "address": "node3:3306", 
                "instanceErrors": [
                    "WARNING: Instance is NOT a PRIMARY but super_read_only option is OFF.", 
                    "NOTE: group_replication is stopped.", 
                    "WARNING: server_uuid for instance has changed from its last known value. Use cluster.rescan() to update the metadata."
                ], 
                "memberRole": "SECONDARY", 
                "memberState": "OFFLINE", 
                "mode": "n/a", 
                "readReplicas": {}, 
                "role": "HA", 
                "status": "(MISSING)", 
                "version": "8.0.33"
            }
        }, 
        "topologyMode": "Single-Primary"
    }, 
    "groupInformationSourceMember": "node1:3306"
}

发现节点2、3虽然事实上已经不在了，但是还是被数据库记录到集群信息里。

注意第三行的报告， Use cluster.rescan() to update the metadata. 可以使用 cluster.rescan() 修复。

一定要多注意这些WARNING或者ERROR信息，真的很有用。

2.3.4剔除无效节点

MySQL localhost JS > cluster.removeInstance('mgr_user@node2:3306');

ERROR: node2:3306 is reachable but has state OFFLINE
To safely remove it from the cluster, it must be brought back ONLINE. If not possible, use the 'force' option to remove it anyway.
Do you want to continue anyway (only the instance metadata will be removed)? [y/N]: y
The instance will be removed from the InnoDB Cluster.
NOTE: The recovery user name for instance 'node2:3306' does not match the expected format for users created automatically by InnoDB Cluster. Skipping its removal.
NOTE: Transaction sync was skipped
NOTE: The instance 'node2:3306' is OFFLINE, Group Replication stop skipped.
ERROR: Instance 'node2:3306' failed to leave the cluster: node2:3306: Replica channel 'group_replication_applier' does not exist.
ERROR: MySQL Error 3074 (HY000): node2:3306: Replica channel 'group_replication_applier' does not exist.
Cluster.removeInstance: node2:3306: Replica channel 'group_replication_applier' does not exist. (MYSQLSH 3074)

剔除无效节点但是失败了，它提示我可以用force的方式强制干掉，可是没提供格式，陈辰老师说可以这样试试 , {force: true} ，但是最后也没找到正确格式。通过《MySQL DBA 精英实战课》的 13.3 InnoDB Cluster这个章节找到了 cluster.rescan(); 这个命令，和上边的warning信息一致。

cluster.rescan()

如果手动更改了实例的配置，或者实例退出集群之后，需要更新集群的元数据，则可以使用该命令。使用 rescan() 可以检测没有在元数据中注册的新活动实例并添加他们，或者删除在元数据中的过时实例。

2.3.5更新元数据

 MySQL  localhost  JS > cluster.rescan();
Rescanning the cluster...

Result of the rescanning operation for the 'Cluster01' cluster:
{
    "name": "Cluster01", 
    "newTopologyMode": null, 
    "newlyDiscoveredInstances": [], 
    "unavailableInstances": [
        {
            "host": "node2:3306", 
            "label": "node2:3306", 
            "member_id": "3be52ec1-0929-11ee-b03d-fa163ef86799"
        }, 
        {
            "host": "node3:3306", 
            "label": "node3:3306", 
            "member_id": "b828bf5b-0929-11ee-bb20-fa163ea274e9"
        }
    ], 
    "updatedInstances": []
}

The instance 'node2:3306' is no longer part of the cluster.
The instance is either offline or left the HA group. You can try to add it to the cluster again with the cluster.rejoinInstance('node2:3306') command or you can remove it from the cluster configuration.
Would you like to remove it from the cluster metadata? [Y/n]: y
Removing instance from the cluster metadata...
The instance 'node2:3306' was successfully removed from the cluster metadata.

The instance 'node3:3306' is no longer part of the cluster.
The instance is either offline or left the HA group. You can try to add it to the cluster again with the cluster.rejoinInstance('node3:3306') command or you can remove it from the cluster configuration.
Would you like to remove it from the cluster metadata? [Y/n]: y
Removing instance from the cluster metadata...
The instance 'node3:3306' was successfully removed from the cluster metadata.

Dropping unused recovery account: 'mysql_innodb_cluster_101'@'%'
Dropping unused recovery account: 'mysql_innodb_cluster_102'@'%'

碰到确定选项就输入 y，确认删除无效节点信息。

2.3.6添加节点2

cluster.addInstance('mgr_user@11.11.11.12:3306')

需要输入C，表示克隆

NOTE: The target instance 'mjndfev2:3306' has not been pre-provisioned (GTID set is empty). The Shell is unable to decide whether incremental state recovery can correctly provision it.
The safest and most convenient way to provision a new instance is through automatic clone provisioning, which will completely overwrite the state of 'mjndfev2:3306' with a physical snapshot from an existing cluster member. To use this method by default, set the 'recoveryMethod' option to 'clone'.
The incremental state recovery may be safely used if you are sure all updates ever executed in the cluster were done with GTIDs enabled, there are no purged transactions and the new instance contains the same GTID set as the cluster or a subset of it. To use this method by default, set the 'recoveryMethod' option to 'incremental'.
Please select a recovery method [C]lone/[I]ncremental recovery/[A]bort (default Clone): C

同步完成后会重启添加的节点2

NOTE: node2:3306 is shutting down...

同步成功完成的信息

* Waiting for server restart... ready
* node2:3306 has restarted, waiting for clone to finish...
** Stage RESTART: Completed
* Clone process has finished: 73.67 MB transferred in about 1 second (~73.67 MB/s)
State recovery already finished for 'node2:3306'
The instance 'node2:3306' was successfully added to the cluster.

这个是以前的同步信息，这次重新同步大概花了10分钟左右的时间，同步56G

查看集群状态

cluster.status();

添加成功了，但是有报错

 MySQL  localhost  JS > cluster.status();
{
    "clusterName": "Cluster01", 
    "defaultReplicaSet": {
        "name": "default", 
        "primary": "node1:3306", 
        "ssl": "REQUIRED", 
        "status": "OK_NO_TOLERANCE", 
        "statusText": "Cluster is NOT tolerant to any failures.", 
        "topology": {
            "node1:3306": {
                "address": "node1:3306", 
                "instanceErrors": [
                    "WARNING: Detected an unused recovery account: mysql_innodb_cluster_101. Use Cluster.rescan() to clean up."
                ], 
                "memberRole": "PRIMARY", 
                "mode": "R/W", 
                "readReplicas": {}, 
                "replicationLag": "applier_queue_applied", 
                "role": "HA", 
                "status": "ONLINE", 
                "version": "8.0.33"
            }, 
            "node2:3306": {
                "address": "node2:3306", 
                "instanceErrors": [
                    "WARNING: Instance is not managed by InnoDB cluster. Use cluster.rescan() to repair.", 
                    "ERROR: Invalid or missing information of Group Replication's network address in metadata. Use Cluster.rescan() to update the metadata."
                ], 
                "memberRole": "SECONDARY", 
                "mode": "R/O", 
                "readReplicas": {}, 
                "replicationLag": "applier_queue_applied", 
                "role": "HA", 
                "status": "ONLINE", 
                "version": "8.0.33"
            }
        }, 
        "topologyMode": "Single-Primary"
    }, 
    "groupInformationSourceMember": "node1:3306"
}

2.3.7添加节点3

cluster.addInstance('mgr_user@11.11.11.13:3306')

这个是成功爆出的信息

NOTE: node3:3306 is shutting down...

* Waiting for server restart... ready 
* node3:3306 has restarted, waiting for clone to finish...
** Stage RESTART: Completed
* Clone process has finished: 55.92 GB transferred in 1 hour 16 min 22 sec (12.20 MB/s)

State recovery already finished for 'node3:3306'

2.3.8查看集群信息

 MySQL  localhost  JS > cluster.status();

                 "instanceErrors": [
                    "WARNING: Instance is not managed by InnoDB cluster. Use cluster.rescan() to repair.", 
                    "ERROR: Invalid or missing information of Group Replication's network address in metadata. Use Cluster.rescan() to update the metadata."
                ],

发现节点2、3集群信息都有这个报错

2.3.9修复MGR集群

 MySQL  localhost  JS > cluster.rescan();
Rescanning the cluster...

Result of the rescanning operation for the 'Cluster01' cluster:
{
    "name": "Cluster01", 
    "newTopologyMode": null, 
    "newlyDiscoveredInstances": [
        {
            "host": "node3:3306", 
            "member_id": "b0fb50e9-0d02-11ef-adf7-fa163ea274e9", 
            "name": null, 
            "version": "8.0.33"
        }, 
        {
            "host": "node2:3306", 
            "member_id": "f0e7990b-0d00-11ef-90e8-fa163ef86799", 
            "name": null, 
            "version": "8.0.33"
        }
    ], 
    "unavailableInstances": [], 
    "updatedInstances": []
}

A new instance 'node3:3306' was discovered in the cluster.
Would you like to add it to the cluster metadata? [Y/n]: y
Adding instance to the cluster metadata...
The instance 'node3:3306' was successfully added to the cluster metadata.

A new instance 'node2:3306' was discovered in the cluster.
Would you like to add it to the cluster metadata? [Y/n]: y
Adding instance to the cluster metadata...
The instance 'node2:3306' was successfully added to the cluster metadata.

修复成功，约到需要输出的信息请输入y来确定

2.3.10检查集群状态

 MySQL  localhost  JS > cluster.status();
{
    "clusterName": "Cluster01", 
    "defaultReplicaSet": {
        "name": "default", 
        "primary": "node1:3306", 
        "ssl": "REQUIRED", 
        "status": "OK", 
        "statusText": "Cluster is ONLINE and can tolerate up to ONE failure.", 
        "topology": {
            "node1:3306": {
                "address": "node1:3306", 
                "memberRole": "PRIMARY", 
                "mode": "R/W", 
                "readReplicas": {}, 
                "replicationLag": "applier_queue_applied", 
                "role": "HA", 
                "status": "ONLINE", 
                "version": "8.0.33"
            }, 
            "node2:3306": {
                "address": "node2:3306", 
                "memberRole": "SECONDARY", 
                "mode": "R/O", 
                "readReplicas": {}, 
                "replicationLag": "applier_queue_applied", 
                "role": "HA", 
                "status": "ONLINE", 
                "version": "8.0.33"
            }, 
            "node3:3306": {
                "address": "node3:3306", 
                "memberRole": "SECONDARY", 
                "mode": "R/O", 
                "readReplicas": {}, 
                "replicationLag": "applier_queue_applied", 
                "role": "HA", 
                "status": "ONLINE", 
                "version": "8.0.33"
            }
        }, 
        "topologyMode": "Single-Primary"
    }, 
    "groupInformationSourceMember": "node1:3306"
}

修复成功

墨力计划

「喜欢这篇文章，您的关注和赞赏是给作者最好的鼓励」

关注作者