一、Watchdog工作原理
1.1 基础架构
plaintext
Watchdog组件:
类型 | 功能 | 适用场景
硬件看门狗 | 物理重启 | 系统级死锁
软件看门狗 | 进程监控 | 应用级故障
系统看门狗 | 资源监控 | 性能问题
工作流程:
1. 初始化计时器
2. 周期性喂狗(重置计时器)
3. 若超时仍未喂狗,计时器到期
4. 触发重启动作
1.2 硬件支持检查
bash
# Check whether a watchdog driver is already loaded.
# Many driver modules are named *_wdt (e.g. iTCO_wdt) or softdog, so a plain
# "grep watchdog" would miss them; match those names as well.
lsmod | grep -Ei 'watchdog|wdt|softdog'
# Load a hardware watchdog driver (normally requires root privileges)
modprobe iTCO_wdt       # Intel TCO watchdog
modprobe ipmi_watchdog  # IPMI watchdog
# List the watchdog device nodes
ls -l /dev/watchdog*
# Show the current status of the first watchdog device
cat /sys/class/watchdog/watchdog0/status
二、基础配置实现
2.1 内核配置
python
def check_kernel_config():
    """Check the running kernel's build configuration for watchdog options.

    Reads ``/boot/config-<kernel release>`` and passes its text, together
    with the required option/value pairs, to ``verify_configs``.

    Returns:
        Whatever ``verify_configs`` returns (that helper is not defined in
        this excerpt -- presumably a pass/fail result; confirm).

    Raises:
        OSError: If the kernel config file cannot be opened, e.g. because
            the distribution does not install it under ``/boot``.
    """
    # Local import keeps this snippet self-contained.
    import platform

    required_configs = {
        'CONFIG_WATCHDOG': 'y',
        'CONFIG_WATCHDOG_CORE': 'y',
        'CONFIG_SOFT_WATCHDOG': 'm',
        'CONFIG_WATCHDOG_NOWAYOUT': 'y',
    }
    # Python does not expand shell substitutions such as "$(uname -r)", so
    # the release string has to be obtained from the platform module instead.
    config_path = '/boot/config-' + platform.release()
    with open(config_path) as f:
        kernel_config = f.read()
    return verify_configs(kernel_config, required_configs)
2.2 服务配置
ini
# /etc/watchdog.conf settings
# Device node used by the daemon, the timeout programmed into it, and how
# often the daemon runs its checks (presumably both in seconds -- confirm
# against watchdog.conf(5)). The interval must stay below the timeout.
watchdog-device =/dev/watchdog
watchdog-timeout =60
interval =10
# Monitored items
# File to watch and the limit on how long it may go without changing
file =/var/log/syslog
change =1800
# Temperature source and the upper limit for it
temperature-sensor =/sys/class/thermal/thermal_zone0/temp
max-temperature =90
# Memory limits.
# NOTE(review): watchdog.conf(5) describes these values as page counts;
# confirm that a percentage such as "5%" is accepted by the installed version.
min-memory =10000
allocatable-memory =5%
# Network checks: address to ping, number of pings, interface to monitor
ping =8.8.8.8
ping-count =3
interface= eth0
# Administrator account to notify, and scheduling options for the daemon
admin = root
realtime = yes
priority =1
三、监控策略实现
3.1 系统资源监控
python
class SystemWatchdog:
    """Registry of system resource monitors keyed by resource name.

    ``get_cpu_metrics``, ``get_memory_metrics`` and ``check_thresholds`` are
    helpers that are not defined in this excerpt; the monitor methods only
    pass data to them and return their result.
    """

    def __init__(self):
        """Build the name -> monitor callable dispatch table."""
        self.monitors = {
            'cpu': self.monitor_cpu,
            'memory': self.monitor_memory,
            'disk': self.monitor_disk,
            'network': self.monitor_network,
        }

    def monitor_cpu(self):
        """Compare current CPU metrics against fixed thresholds.

        Returns:
            The result of ``check_thresholds(metrics, thresholds)``.
        """
        # Units are not stated in the source; presumably load_avg is a
        # per-core ratio and iowait/steal are percentages -- TODO confirm.
        thresholds = {
            'load_avg': 0.9,
            'iowait': 30,
            'steal': 5,
        }
        metrics = get_cpu_metrics()
        return check_thresholds(metrics, thresholds)

    def monitor_memory(self):
        """Compare current memory metrics against fixed thresholds.

        Returns:
            The result of ``check_thresholds(metrics, thresholds)``.
        """
        # Note the mixed value types: percentage strings and a plain integer.
        # ``check_thresholds`` must be able to interpret both.
        thresholds = {
            'available': '10%',
            'swap_used': '80%',
            'page_faults': 1000,
        }
        metrics = get_memory_metrics()
        return check_thresholds(metrics, thresholds)

    def monitor_disk(self):
        """Placeholder for the disk monitor registered in ``__init__``.

        The original example registers this monitor without defining it,
        which made instantiating the class fail with ``AttributeError``.

        Raises:
            NotImplementedError: Always, until a real check is supplied.
        """
        raise NotImplementedError('monitor_disk is not implemented')

    def monitor_network(self):
        """Placeholder for the network monitor registered in ``__init__``.

        Raises:
            NotImplementedError: Always, until a real check is supplied.
        """
        raise NotImplementedError('monitor_network is not implemented')
3.2 进程监控
python
def monitor_critical_processes():
    """Check each critical process and hand failures to the failure handler.

    For every entry in the built-in process table, ``check_process`` is
    called with that entry's settings; when the returned status reports
    ``is_running`` as false, ``handle_process_failure`` is invoked with the
    process name and settings.

    ``check_process`` and ``handle_process_failure`` are not defined in this
    excerpt. Presumably the handler is what enforces ``max_restarts`` --
    confirm against its implementation.
    """
    processes = {
        'nginx': {
            'pid_file': '/var/run/nginx.pid',
            'port': 80,
            'max_restarts': 3,
        },
        'mysql': {
            'pid_file': '/var/run/mysqld/mysqld.pid',
            'port': 3306,
            'max_restarts': 2,
        },
    }
    for name, config in processes.items():
        status = check_process(config)
        if not status.is_running:
            handle_process_failure(name, config)
四、自动恢复机制
4.1 故障恢复策略
python
class RecoveryManager:
    """Recovery procedures for process-, service- and system-level failures.

    The helpers called by these procedures (``stop_process``,
    ``cleanup_resources``, ``start_process``, ``verify_status``,
    ``handle_recovery_failure``, ``is_kernel_issue``, ``trigger_reboot`` and
    ``restart_services``) are not defined in this excerpt and must be
    provided by the full implementation or a subclass.
    """

    def __init__(self):
        """Build the failure level -> recovery callable dispatch table."""
        self.recovery_actions = {
            'process': self.recover_process,
            'service': self.recover_service,
            'system': self.recover_system,
        }

    def recover_process(self, process_info):
        """Run the process recovery steps in order, stopping at the first failure.

        Args:
            process_info: Passed unchanged to every step.

        Returns:
            True if every step completed without raising; False as soon as
            one step raises, after reporting it through
            ``handle_recovery_failure(step_name, exception)``.
        """
        steps = [
            ('stop_process', self.stop_process),
            ('cleanup_resources', self.cleanup_resources),
            ('start_process', self.start_process),
            ('verify_status', self.verify_status),
        ]
        for step_name, step_func in steps:
            try:
                step_func(process_info)
            except Exception as e:
                # Later steps depend on earlier ones, so abort the sequence.
                self.handle_recovery_failure(step_name, e)
                return False
        return True

    def recover_service(self, service_info):
        """Placeholder for the service-level recovery registered in ``__init__``.

        The original example registers this action without defining it,
        which made instantiating the class fail with ``AttributeError``.

        Raises:
            NotImplementedError: Always, until a real procedure is supplied.
        """
        raise NotImplementedError('recover_service is not implemented')

    def recover_system(self, failure_info):
        """Reboot for kernel issues, otherwise restart services.

        Args:
            failure_info: Passed to ``is_kernel_issue`` to classify the failure.
        """
        if self.is_kernel_issue(failure_info):
            self.trigger_reboot()
        else:
            self.restart_services()
4.2 日志记录
python
def setup_logging():
    """Assemble the watchdog logging configuration and apply it.

    Two handlers are declared -- a rotating log file and the local syslog
    socket -- and both are attached to the root logger at INFO level. The
    resulting dictionary is handed to ``configure_logging``, which is not
    defined in this excerpt; the layout looks like the
    ``logging.config.dictConfig`` schema (confirm against that helper).

    Returns:
        Whatever ``configure_logging`` returns.
    """
    rotating_file = {
        'class': 'logging.handlers.RotatingFileHandler',
        'filename': '/var/log/watchdog.log',
        'maxBytes': 10485760,  # 10 * 1024 * 1024 bytes
        'backupCount': 5,
    }
    local_syslog = {
        'class': 'logging.handlers.SysLogHandler',
        'address': '/dev/log',
        'facility': 'local0',
    }
    log_config = {
        'version': 1,
        'handlers': {
            'file': rotating_file,
            'syslog': local_syslog,
        },
        'root': {
            'level': 'INFO',
            'handlers': ['file', 'syslog'],
        },
    }
    return configure_logging(log_config)
五、实际应用案例
5.1 Web服务器监控
python
class WebServerWatchdog:
    """Health checks for a web server, keyed by check name.

    ``http_get`` and ``handle_failure`` are not defined in this excerpt and
    must be provided by the full implementation or a subclass.
    """

    def __init__(self):
        """Build the check name -> check callable dispatch table."""
        self.checks = {
            'http_check': self.check_http,
            'cert_check': self.check_ssl_cert,
            'backend_check': self.check_backend,
        }

    def check_http(self):
        """Request each endpoint and report unexpected status codes.

        Every endpoint is fetched with ``self.http_get(url)``; when the
        response's ``status_code`` differs from the expected code,
        ``self.handle_failure('http', endpoint)`` is called. All endpoints
        are checked even after a failure. Returns nothing.
        """
        endpoints = [
            {'url': '/', 'expect_code': 200},
            {'url': '/health', 'expect_code': 200},
            {'url': '/api/status', 'expect_code': 200},
        ]
        for endpoint in endpoints:
            response = self.http_get(endpoint['url'])
            if response.status_code != endpoint['expect_code']:
                self.handle_failure('http', endpoint)

    def check_ssl_cert(self):
        """Placeholder for the certificate check registered in ``__init__``.

        The original example registers this check without defining it,
        which made instantiating the class fail with ``AttributeError``.

        Raises:
            NotImplementedError: Always, until a real check is supplied.
        """
        raise NotImplementedError('check_ssl_cert is not implemented')

    def check_backend(self):
        """Placeholder for the backend check registered in ``__init__``.

        Raises:
            NotImplementedError: Always, until a real check is supplied.
        """
        raise NotImplementedError('check_backend is not implemented')
5.2 数据库监控
python
class DatabaseWatchdog:
    """Database health checks.

    The individual ``check_*`` methods, ``handle_check_failure`` and the
    module-level ``analyze_results`` are not defined in this excerpt and
    must be provided by the full implementation.
    """

    def monitor_database(self):
        """Run every database check and analyze the collected results.

        Each check runs independently: a check that raises is reported via
        ``self.handle_check_failure(check_name, exception)`` and leaves no
        entry in the results, while the remaining checks still run.

        Returns:
            The result of ``analyze_results(results)``, where ``results``
            maps the name of each check that completed to its return value.
        """
        checks = {
            'connection': self.check_connection,
            'replication': self.check_replication,
            'slow_queries': self.check_slow_queries,
            'connections': self.check_max_connections,
        }
        results = {}
        for check_name, check_func in checks.items():
            try:
                results[check_name] = check_func()
            except Exception as e:
                # One failing check must not prevent the others from running.
                self.handle_check_failure(check_name, e)
        return analyze_results(results)