状态检测和故障切换

前言

  • keepalived虽然内置了配合LVS实现高可用的配置文件,但对其他服务并没有相应的配置,但这没关系,keepalived还支持通过script来实现对服务状态的检测和故障切换

原理

  • 在global配置段外通过vrrp_script定义脚本或者shell命令,然后在vrrp_instance配置段中通过track_script调用脚本,最后根据脚本的执行结果(成功或失败) 来增加或减少当前节点的优先级。因为keepalived是根据优先级来判断是否为主节点,所以当优先级低时就成为备份节点,从而将VIP飘动到优先级高的主节点上,实现了对其他服务的高可用

相关配置

  • 参考配置文件:/usr/share/doc/keepalived/keepalived.conf.vrrp.localcheck

定义脚本

  • 在vrrp_script段配置
vrrp_script chk_nginx { # chk_nginx为自定义的vrrp脚本名称 后续调用需要此名称
       script "killall -0 sshd"        # 执行的脚本或命令
       interval 2                      # 监测的时间间隔 单位:秒 默认:1秒
       weight -4                       # 此值默认为0 范围:-253~253 ,如果此值为负数 则代表当上面的脚本或命令执行失败时($?结果为非0) 则将本节点的优先级减去定义的值,如果此值为正数 则代表当上面的脚本或命令执行成功时($?结果为0) 则将本节点的优先级加上定义的值
       fall 2                          # 脚本或命令执行几次失败后 将本节点标记为失败(进而进行减优先级操作)
       rise 2                          # 脚本或命令执行几次成功后 将本节点标记为成功(进而恢复节点)
       user USERNAME                   # 以什么用户身份执行脚本
}

调用脚本

vrrp_instance VI_1 {
...
    track_script {
       script_name
    }
}

范例:利用脚本实现主从切换

环境准备

IP VIP service hostname
10.0.0.18 10.0.0.100 Keepalived Keepalived1
10.0.0.28 10.0.0.100 Keepalived Keepalived2

Keepalived1配置

#定义子配置文件
[root@keepalived1 ~]# cat /etc/keepalived/keepalived.conf 
global_defs {
   router_id KA1
}

include /etc/keepalived/conf.d/*.conf

----------------------------------------------------------------------------

#添加子配置文件
[root@keepalived1 ~]# cat /etc/keepalived/conf.d/virtual_vip.conf 
vrrp_script check_test { #脚本名称
       script "/etc/keepalived/down.sh" #调用的脚本
       interval 2
       weight -30 #当执行失败后本节点权重加上-30,也就等于减30了
       fall 2 #连续几次执行脚本失败就认为失败,然后执行权重-30
       rise 2
}

vrrp_instance WEB1 { #定义的虚拟路由器实例
    state MASTER
    interface eth0
    virtual_router_id 66
    priority 100 #权重100
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass 666
    }

    virtual_ipaddress {
        10.0.0.100/24 dev eth0 label eth0:0
    }

    unicast_src_ip 10.0.0.18
    unicast_peer {
    10.0.0.28
    }

    track_script {
    	check_test #调用脚本
    }
}

Keepalived2配置

#定义子配置文件
[root@keepalived2 ~]# cat /etc/keepalived/keepalived.conf 
global_defs {
   router_id KA2
}

include /etc/keepalived/conf.d/*.conf

----------------------------------------------------------------------------

#添加子配置文件
[root@keepalived2 ~]# cat /etc/keepalived/conf.d/virtual_vip.conf 
vrrp_script check_test {
       script "/etc/keepalived/down.sh"
       interval 2
       weight -30
       fall 2
       rise 2
}

vrrp_instance WEB1 {
    state BACKUP
    interface eth0
    virtual_router_id 66
    priority 80 #权重80
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass 666
    }

    virtual_ipaddress {
        10.0.0.100/24 dev eth0 label eth0:0
    }

    unicast_src_ip 10.0.0.28
    unicast_peer {
    10.0.0.18
    }

    track_script {
    	check_test
    }
}

测试脚本

  • 注意:修改检测脚本无需重启 keepalived 即可立即生效
[root@keepalived2 ~]# cat /etc/keepalived/down.sh
#!/bin/bash
[ ! -f /etc/keepalived/down ] #文件不存在则为真$?0 存在则为假$?非0

测试

  • 重启keepalived后进行测试
#因为keepalived1的优先级高 所以目前VIP在 keepalived1上
[root@keepalived1 ~]# hostname -I
10.0.0.18 10.0.0.100 
[root@keepalived2 ~]# hostname -I
10.0.0.28

#抓包查看
[root@web1 ~]# tcpdump -nn src host 10.0.0.18 and dst host 10.0.0.28
07:36:43.287823 IP 10.0.0.18 > 10.0.0.28: VRRPv2, Advertisement, vrid 66, prio 100, authtype simple, intvl 1s, length 20

#创建测试文件
[root@keepalived1 ~]# touch /etc/keepalived/down

#观察日志,可以看到通过监测脚本发现结果为假 所以进而执行减权重
Feb 21 00:18:58 keepalived1 Keepalived_vrrp[9509]: Script `check_test` now returning 1
Feb 21 00:19:00 keepalived1 Keepalived_vrrp[9509]: VRRP_Script(check_test) failed (exited with status 1)
Feb 21 00:19:00 keepalived1 Keepalived_vrrp[9509]: (WEB1) Changing effective priority from 100 to 70
Feb 21 00:19:04 keepalived1 Keepalived_vrrp[9509]: (WEB1) Master received advert from 10.0.0.28 with higher priority 80, ours 70
Feb 21 00:19:04 keepalived1 Keepalived_vrrp[9509]: (WEB1) Entering BACKUP STATE
Feb 21 00:19:04 keepalived1 Keepalived_vrrp[9509]: (WEB1) removing VIPs.

#观察VIP飘动情况
[root@keepalived1 ~]# hostname -I
10.0.0.18 
[root@keepalived2 ~]# hostname -I
10.0.0.28 10.0.0.100

#抓包查看,注意这个时候应该反方向抓包了,因为使用的是单播模式
[root@web1 ~]# tcpdump -nn src host 10.0.0.28 and dst host 10.0.0.18
08:19:13.013447 IP 10.0.0.28 > 10.0.0.18: VRRPv2, Advertisement, vrid 66, prio 80, authtype simple, intvl 1s, length 20

#删除测试文件
[root@keepalived1 ~]# rm -f /etc/keepalived/down

#观察日志,可以看到检测脚本发现执行结果为0 进而恢复权重 因此夺回VIP
Feb 21 00:22:01 keepalived1 Keepalived_vrrp[9509]: Script `check_test` now returning 0
Feb 21 00:22:03 keepalived1 Keepalived_vrrp[9509]: VRRP_Script(check_test) succeeded
Feb 21 00:22:03 keepalived1 Keepalived_vrrp[9509]: (WEB1) Changing effective priority from 70 to 100
Feb 21 00:22:03 keepalived1 Keepalived_vrrp[9509]: (WEB1) received lower priority (80) advert from 10.0.0.28 - discarding
Feb 21 00:22:04 keepalived1 Keepalived_vrrp[9509]: (WEB1) received lower priority (80) advert from 10.0.0.28 - discarding
Feb 21 00:22:05 keepalived1 Keepalived_vrrp[9509]: (WEB1) received lower priority (80) advert from 10.0.0.28 - discarding
Feb 21 00:22:06 keepalived1 Keepalived_vrrp[9509]: (WEB1) Receive advertisement timeout
Feb 21 00:22:06 keepalived1 Keepalived_vrrp[9509]: (WEB1) Entering MASTER STATE
Feb 21 00:22:06 keepalived1 Keepalived_vrrp[9509]: (WEB1) setting VIPs.
Feb 21 00:22:06 keepalived1 Keepalived_vrrp[9509]: Sending gratuitous ARP on eth0 for 10.0.0.100
Feb 21 00:22:06 keepalived1 Keepalived_vrrp[9509]: (WEB1) Sending/queueing gratuitous ARPs on eth0 for 10.0.0.100
Feb 21 00:22:06 keepalived1 Keepalived_vrrp[9509]: Sending gratuitous ARP on eth0 for 10.0.0.100

#观察VIP飘动情况
[root@keepalived1 ~]# hostname -I
10.0.0.18 10.0.0.100
[root@keepalived2 ~]# hostname -I
10.0.0.28

#抓包查看
[root@web1 ~]# tcpdump -nn src host 10.0.0.18 and dst host 10.0.0.28
08:23:58.380622 IP 10.0.0.18 > 10.0.0.28: VRRPv2, Advertisement, vrid 66, prio 100, authtype simple, intvl 1s, length 20

故障检测脚本最佳实践

前言

  • keepalived 支持脚本来进行故障检测 从而实现故障切换和恢复 那么脚本应该怎么编写呢?我们可以从以下几个角度来判断service是否正常工作
  • 如:进程是否存活、主页面是否能成功访问并返回200状态码、服务提供的状态信息展示等多方维度来进行判断
  • 检测失败后,还可以配合重启服务等操作来实现自行恢复,如果能恢复 那么会减少节点切换带来的时间消耗
  • 下面举几个例子可以作为参考:

通用进程检测

方案一

  • **0信号说明:**对进程进行健康性检查(健康则$?结果为0,反之则为1),但结果并不严谨 因为进程只要存在 它就认为进程是健康的 如:进程已经成为僵尸态 而$?的返回值依旧为0
#killall默认未安装,此工具来自psmisc包
killall -0 process_name  \
> ||
> systemctl restart process_name

方案二

ps aux | grep process_name | grep -v grep &> /dev/null \
> ||
> systemctl restart process_name

方案三

systemctl is-active nginx.service &> /dev/null \
> ||
> systemctl restart process_name

MySQL健康性检测

方案一

mysqladmin -uroot -p123456 ping &> /dev/null \
> ||
> systemctl restart mysql

方案二

mysql -uroot -p123456 -e 'status' &> /dev/null \
> ||
> systemctl restart mysql

MySQL主从复制检测

[root@keepalived1 ~]# vim /etc/keepalived/check_mysql.sh
#!/bin/bash

slave_is="$(mysql -uroot -p123456 -e 'show slave status\G' | grep "Slave_.*_Running:" | awk '{print $2}')"

if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes"];then
    exit 0
else
    exit 1
fi    

Haproxy健康性检查

[root@haproxy-ka1 ~]# vim /etc/keepalived/check_haproxy.sh
ps aux | grep process_name | grep -v grep &> /dev/null \
||
systemctl is-active nginx.service &> /dev/null \
||
systemctl restart haproxy

故障检测脚本避坑指南

  • 这是一起在使用 haproxy+keepalived 时遇到的坑
  • 来自:https://github.com/kubernetes/kubeadm/blob/main/docs/ha-considerations.md
  • 先看官方提供的脚本:
#!/bin/sh

errorExit() {
    echo "*** $*" 1>&2
    exit 1
}

curl --silent --max-time 2 --insecure https://localhost:${APISERVER_DEST_PORT}/ -o /dev/null || errorExit "Error GET https://localhost:${APISERVER_DEST_PORT}/"
if ip addr | grep -q ${APISERVER_VIP}; then
    curl --silent --max-time 2 --insecure https://${APISERVER_VIP}:${APISERVER_DEST_PORT}/ -o /dev/null || errorExit "Error GET https://${APISERVER_VIP}:${APISERVER_DEST_PORT}/"
fi
  • 因为 keepalived 不需要标准输出,只需要 $? 的值是 0 还是非 0,所以脚本可以简化成这样:
  • 当然还需要添加所需的变量
#!/bin/sh

APISERVER_VIP='10.0.0.123'
APISERVER_DEST_PORT='6443'

#判断一
curl --silent --max-time 2 --insecure https://localhost:${APISERVER_DEST_PORT}/ -o /dev/null

#判断二
if ip addr | grep -q ${APISERVER_VIP}; then
    curl --silent --max-time 2 --insecure https://${APISERVER_VIP}:${APISERVER_DEST_PORT}/ -o /dev/null
fi
  • 判断一是没问题的,只要可以成功通过本机的 haproxy 访问 apiserver,那么 $? 的返回值就会为0
  • 判断二是有问题的,假设 MASTER 节点的 haproxy 无法访问,判断一 $? 值将返回非0,这时会进行减权重操作,进而 VIP 会飘到 BACKUP 上。但这时因为 VIP 的 6443 端口又可以正常访问了,所以 MASTER 脚本的判断二又可以判断成功,进而 MASTER 节点又会将权重增加,这时 VIP 又会飘到 MASTER 节点上,但 MASTER 节点的 haproxy 已经无法访问,所以权重又会减少,VIP 又会飘到 BACKUP 上。所以最终导致的结果就是 VIP 在 MASTER 和 BACKUP 节点上不断来回飘
  • 正确写法:(有待完善)
#!/bin/bash
#
#********************************************************************
#Author:	     	xiangzheng
#QQ: 			    767483070
#Date: 		     	2022-07-03
#FileName:		    1.sh
#URL: 		    	https://www.xiangzheng.vip
#Email: 		    rootroot25@163.com
#Description:		The test script
#Copyright (C): 	2022 All rights reserved
#********************************************************************

APISERVER_DEST_PORT='6443'

curl --silent --max-time 2 --insecure https://localhost:${APISERVER_DEST_PORT}/ -o /dev/null

总结

  • 他人写的脚本慎用!还是借鉴或参考比较好