Files
linkmaster-node/check-heartbeat.sh
yoyo d8ea772c24 feat: 添加日志文件输出功能和心跳故障排查工具
- 新增日志文件输出功能,支持配置日志文件路径和级别
- 添加心跳故障排查脚本 check-heartbeat.sh
- 支持通过环境变量 LOG_FILE 设置日志文件路径
- 日志自动创建目录,支持相对路径和绝对路径
- 优化日志初始化逻辑,支持直接写入文件
- 改进配置加载,支持日志配置项
- 完善文档,添加故障排查章节和日志功能说明
- 更新版本号至 v1.1.0
2025-12-07 16:37:03 +08:00

513 lines
17 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# ============================================
# LinkMaster node heartbeat troubleshooting script
# Purpose: diagnose node heartbeat synchronisation problems
# ============================================
set -e

# Work from the directory that holds this script so the relative paths
# configured below resolve the same way regardless of the caller's cwd.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$SCRIPT_DIR"

# ANSI colour sequences. They are stored with a literal backslash and are
# only turned into escape characters by the print helpers.
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'

# Settings
BINARY_NAME="agent"
PID_FILE="node.pid"
LOG_FILE="node.log"
# CONFIG_PATH from the environment overrides the default config location.
CONFIG_FILE="${CONFIG_PATH:-config.yaml}"

# Running tallies of the findings, reported in the final summary
WARNINGS=0
ISSUES=0
#######################################
# Print a horizontal rule in cyan.
# Globals:   CYAN, NC (read)
# Outputs:   one line on stdout
#######################################
print_separator() {
  local rule="========================================"
  printf '%b\n' "${CYAN}${rule}${NC}"
}
#######################################
# Print the heading of a check, preceded by a blank line, in blue.
# Globals:   BLUE, NC (read)
# Arguments: $1 - heading text
# Outputs:   heading on stdout
#######################################
print_check_title() {
  local title="${1-}"
  printf '%b\n' "\n${BLUE}${title}${NC}"
}
#######################################
# Print a success message in green.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
# Outputs:   message on stdout
#######################################
print_success() {
  local message="${1-}"
  printf '%b\n' "${GREEN}${message}${NC}"
}
#######################################
# Print a warning message in yellow and count it.
# Globals:   YELLOW, NC (read); WARNINGS (incremented)
# Arguments: $1 - message text
# Outputs:   message on stdout
# Returns:   0
#######################################
print_warning() {
  echo -e "${YELLOW}$1${NC}"
  # Plain assignment instead of ((WARNINGS++)): the arithmetic command
  # returns status 1 when the pre-increment value is 0, which makes the very
  # first warning abort the whole script under `set -e`.
  WARNINGS=$((WARNINGS + 1))
}
#######################################
# Print an error message in red and count it as a serious issue.
# Globals:   RED, NC (read); ISSUES (incremented)
# Arguments: $1 - message text
# Outputs:   message on stdout
# Returns:   0
#######################################
print_error() {
  echo -e "${RED}$1${NC}"
  # Plain assignment instead of ((ISSUES++)): the arithmetic command returns
  # status 1 when the pre-increment value is 0, which makes the very first
  # error abort the whole script under `set -e`.
  ISSUES=$((ISSUES + 1))
}
#######################################
# Print an informational detail line in cyan, prefixed with one space.
# Globals:   CYAN, NC (read)
# Arguments: $1 - message text
# Outputs:   message on stdout
#######################################
print_info() {
  local message="${1-}"
  printf '%b\n' "${CYAN} ${message}${NC}"
}
#######################################
# Print the PID recorded in the PID file if that process is alive.
# A PID file whose content is not a live process is treated as stale and
# removed.
# Globals:   PID_FILE (read)
# Outputs:   the PID, or an empty line when nothing is running
# Returns:   0
#######################################
get_pid() {
  local pid=""

  if [ -f "$PID_FILE" ]; then
    # `read` trims surrounding blanks; it returns non-zero when the file has
    # no trailing newline but still fills the variable, hence `|| true`.
    read -r pid < "$PID_FILE" || true
    pid=${pid%$'\r'} # tolerate CRLF line endings

    # Only hand digits to ps, so garbage in the file cannot be parsed by ps
    # as an option or as a PID list.
    if [[ "$pid" =~ ^[0-9]+$ ]] && ps -p "$pid" > /dev/null 2>&1; then
      echo "$pid"
      return 0
    fi

    # Stale or unusable PID file.
    rm -f "$PID_FILE"
  fi

  echo ""
}
#######################################
# 1. Check whether the node process is running and report its elapsed
#    running time and CPU / memory usage.
# Globals:   PID (written)
# Outputs:   report lines via the print_* helpers
# Returns:   0 when the process is running, 1 otherwise
#######################################
check_process() {
  local runtime="" cpu="" mem=""

  print_check_title "检查进程状态"
  PID=$(get_pid)
  if [ -z "$PID" ]; then
    print_error "节点进程未运行"
    print_info "请使用 ./run.sh start 启动服务"
    return 1
  fi

  print_success "节点进程正在运行 (PID: $PID)"

  # Elapsed running time of the process.
  runtime=$(ps -o etime= -p "$PID" 2>/dev/null | tr -d ' ') || runtime=""
  if [ -n "$runtime" ]; then
    print_info "进程运行时间: $runtime"
  fi

  # CPU / memory usage. Each column gets its own `-o name=` so that both
  # headers are suppressed; `-o %cpu,%mem=` only blanks the last header and
  # still prints a "%CPU" header line. The two values are read separately
  # rather than squeezed together by deleting every blank.
  read -r cpu mem <<< "$(ps -o %cpu= -o %mem= -p "$PID" 2>/dev/null)" || true
  if [ -n "$cpu" ] && [ -n "$mem" ]; then
    print_info "CPU/内存使用: ${cpu}% / ${mem}%"
  fi

  return 0
}
#######################################
# Read one scalar from CONFIG_FILE.
# Uses yq when it is installed, otherwise falls back to a first-match
# grep/sed scan on the leaf key name. A literal "null" (what yq prints for a
# missing key) is normalised to an empty string.
# Globals:   CONFIG_FILE (read)
# Arguments: $1 - yq path expression, e.g. .backend.url
#            $2 - leaf key name used by the grep fallback, e.g. url
# Outputs:   the value (possibly empty) on stdout
#######################################
_config_lookup() {
  local yq_path=$1 leaf_key=$2 value=""

  if command -v yq > /dev/null 2>&1; then
    value=$(yq eval "$yq_path" "$CONFIG_FILE" 2>/dev/null || echo "")
  else
    # Simple textual parse: first line whose key matches, quotes removed.
    value=$(grep -E "^\s*${leaf_key}:" "$CONFIG_FILE" \
      | head -1 \
      | sed "s/.*${leaf_key}:\s*//" \
      | tr -d '"' \
      | tr -d "'" || echo "")
  fi

  if [ "$value" = "null" ]; then
    value=""
  fi
  printf '%s' "$value"
}

#######################################
# 2. Check the configuration file and work out which backend URL is in
#    effect: BACKEND_URL from the environment wins, then the config file,
#    then the built-in default.
# Globals:   CONFIG_FILE, BACKEND_URL (read)
#            BACKEND_URL_FROM_CONFIG, HEARTBEAT_INTERVAL, NODE_ID, NODE_IP
#            (written when the config file exists)
#            FINAL_BACKEND_URL (written and exported on every path)
# Outputs:   report lines via the print_* helpers
# Returns:   0
#######################################
check_config() {
  local default_backend_url="http://localhost:8080"

  print_check_title "检查配置文件"

  if [ ! -f "$CONFIG_FILE" ]; then
    print_warning "配置文件不存在: $CONFIG_FILE"
    print_info "将使用环境变量和默认配置"
    # FINAL_BACKEND_URL must be resolved here as well; otherwise the later
    # network check has no URL although the message below promises a default.
    if [ -n "${BACKEND_URL:-}" ]; then
      FINAL_BACKEND_URL="$BACKEND_URL"
      print_info "使用环境变量 BACKEND_URL: $BACKEND_URL"
    else
      FINAL_BACKEND_URL="$default_backend_url"
      print_warning "未设置 BACKEND_URL 环境变量,将使用默认值: http://localhost:8080"
    fi
    export FINAL_BACKEND_URL
    return 0
  fi

  print_success "配置文件存在: $CONFIG_FILE"

  # Read the relevant settings.
  BACKEND_URL_FROM_CONFIG=$(_config_lookup '.backend.url' 'url')
  HEARTBEAT_INTERVAL=$(_config_lookup '.heartbeat.interval' 'interval')
  NODE_ID=$(_config_lookup '.node.id' 'id')
  NODE_IP=$(_config_lookup '.node.ip' 'ip')

  # Decide which backend URL is in effect.
  if [ -n "${BACKEND_URL:-}" ]; then
    FINAL_BACKEND_URL="$BACKEND_URL"
    print_info "使用环境变量 BACKEND_URL: $FINAL_BACKEND_URL"
  elif [ -n "$BACKEND_URL_FROM_CONFIG" ]; then
    FINAL_BACKEND_URL="$BACKEND_URL_FROM_CONFIG"
    print_info "使用配置文件中的后端URL: $FINAL_BACKEND_URL"
  else
    FINAL_BACKEND_URL="$default_backend_url"
    print_warning "未找到后端URL配置使用默认值: $FINAL_BACKEND_URL"
  fi

  if [ -n "$HEARTBEAT_INTERVAL" ]; then
    print_info "心跳间隔: ${HEARTBEAT_INTERVAL}"
  else
    print_info "心跳间隔: 60秒 (默认值)"
  fi

  # A node id of 0 counts as "not configured".
  if [ -n "$NODE_ID" ] && [ "$NODE_ID" != "0" ]; then
    print_success "节点ID已配置: $NODE_ID"
  else
    print_warning "节点ID未配置或为0将在首次心跳时获取"
  fi

  if [ -n "$NODE_IP" ]; then
    print_success "节点IP已配置: $NODE_IP"
  else
    print_warning "节点IP未配置将在首次心跳时获取"
  fi

  export FINAL_BACKEND_URL
}
#######################################
# 3. Check connectivity to the backend: name resolution, TCP port
#    reachability, and an HTTP POST to the heartbeat endpoint.
# Globals:   FINAL_BACKEND_URL (read)
#            BACKEND_HOST, BACKEND_PORT, HEARTBEAT_URL (written)
# Outputs:   report lines via the print_* helpers
# Returns:   0 when no check failed, 1 on the first failed check or when no
#            backend URL is known
#######################################
check_network() {
  local authority port_state http_code
  local ipv4_re='^[0-9]+(\.[0-9]+){3}$'
  local -a resolver=()

  print_check_title "检查网络连接"

  if [ -z "${FINAL_BACKEND_URL:-}" ]; then
    print_error "无法确定后端URL跳过网络检查"
    return 1
  fi

  # Split "scheme://host[:port]/path" into host and port.
  authority=${FINAL_BACKEND_URL#http://}
  authority=${authority#https://}
  authority=${authority%%/*}
  BACKEND_HOST=${authority%%:*}
  # Only take a port when the authority really contains a colon. Cutting
  # field 2 on ':' returns the whole host when there is no colon, which kept
  # the default-port logic below from ever running.
  BACKEND_PORT=""
  if [[ "$authority" == *:* ]]; then
    BACKEND_PORT=${authority##*:}
  fi
  if [ -z "$BACKEND_PORT" ]; then
    if [[ "$FINAL_BACKEND_URL" == https://* ]]; then
      BACKEND_PORT=443
    else
      BACKEND_PORT=80
    fi
  fi
  print_info "后端地址: $BACKEND_HOST:$BACKEND_PORT"

  # Name resolution. An IPv4 literal needs no lookup. getent is preferred
  # because it also honours /etc/hosts (e.g. "localhost"), which pure DNS
  # clients such as nslookup/host do not consult.
  if [[ ! "$BACKEND_HOST" =~ $ipv4_re ]]; then
    if command -v getent > /dev/null 2>&1; then
      resolver=(getent hosts)
    elif command -v nslookup > /dev/null 2>&1; then
      resolver=(nslookup)
    elif command -v host > /dev/null 2>&1; then
      resolver=(host)
    fi
    if ((${#resolver[@]} > 0)); then
      if "${resolver[@]}" "$BACKEND_HOST" > /dev/null 2>&1; then
        print_success "DNS解析成功: $BACKEND_HOST"
      else
        print_error "DNS解析失败: $BACKEND_HOST"
        return 1
      fi
    fi
  fi

  # TCP port reachability.
  port_state="unknown"
  if command -v nc > /dev/null 2>&1; then
    if nc -z -w 3 "$BACKEND_HOST" "$BACKEND_PORT" 2>/dev/null; then
      port_state="open"
    else
      port_state="closed"
    fi
  elif command -v timeout > /dev/null 2>&1; then
    # bash's built-in /dev/tcp redirection; host and port are passed as
    # positional arguments so they are never parsed as shell code.
    if timeout 3 bash -c 'echo > "/dev/tcp/$1/$2"' _ \
      "$BACKEND_HOST" "$BACKEND_PORT" 2>/dev/null; then
      port_state="open"
    else
      port_state="closed"
    fi
  fi
  case "$port_state" in
    open)
      print_success "端口连通性检查通过: $BACKEND_HOST:$BACKEND_PORT"
      ;;
    closed)
      print_error "端口无法连接: $BACKEND_HOST:$BACKEND_PORT"
      print_info "可能原因: 防火墙阻止、后端服务未启动、网络不通"
      return 1
      ;;
    *)
      print_warning "无法检查端口连通性(需要 nc 或 timeout 命令)"
      ;;
  esac

  # HTTP check against the heartbeat endpoint.
  HEARTBEAT_URL="${FINAL_BACKEND_URL%/}/api/node/heartbeat"
  print_info "测试心跳接口: $HEARTBEAT_URL"

  if command -v curl > /dev/null 2>&1; then
    # On a connection failure curl itself prints "000" for %{http_code} and
    # exits non-zero. Appending another "000" in that case would produce
    # "000000" and hide the failure, so the exit status is simply ignored.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
      --connect-timeout 5 --max-time 10 \
      -X POST \
      -H "Content-Type: application/x-www-form-urlencoded" \
      -d "type=pingServer" \
      "$HEARTBEAT_URL" 2>/dev/null) || true
  elif command -v wget > /dev/null 2>&1; then
    # The pipeline's status is awk's, so "no status line found" shows up as
    # an empty value rather than a failure; it is mapped to 000 below.
    http_code=$(wget --spider --server-response --timeout=5 --tries=1 \
      --post-data="type=pingServer" \
      --header="Content-Type: application/x-www-form-urlencoded" \
      "$HEARTBEAT_URL" 2>&1 | grep -E "HTTP/" | tail -1 | awk '{print $2}') || true
  else
    print_warning "无法测试HTTP连接(需要 curl 或 wget 命令)"
    return 0
  fi
  http_code=${http_code:-000}

  case "$http_code" in
    200)
      print_success "心跳接口响应正常 (HTTP 200)"
      ;;
    000)
      print_error "无法连接到心跳接口"
      print_info "可能原因: 网络不通、后端服务未启动、防火墙阻止"
      return 1
      ;;
    *)
      print_warning "心跳接口返回异常状态码: HTTP $http_code"
      print_info "这可能是正常的,取决于后端实现"
      ;;
  esac

  return 0
}
#######################################
# 4. Inspect the log file: size, recent heartbeat successes / failures /
#    errors, recent generic errors, and the last heartbeat line.
# Globals:   LOG_FILE, GREEN, YELLOW, RED, NC (read)
#            WARNINGS, ISSUES (incremented when failure / error records exist)
# Outputs:   report lines on stdout
# Returns:   0
#######################################
check_logs() {
  local log_size heartbeat_success heartbeat_failed heartbeat_error
  local recent_errors last_heartbeat

  print_check_title "检查日志文件"

  if [ ! -f "$LOG_FILE" ]; then
    print_warning "日志文件不存在: $LOG_FILE"
    print_info "如果服务刚启动,日志文件可能还未创建"
    return 0
  fi
  print_success "日志文件存在: $LOG_FILE"

  # Size in bytes. `wc -c` behaves the same on GNU and BSD systems, unlike
  # `stat`, whose -f / -c options mean different things on each.
  log_size=$(wc -c < "$LOG_FILE" 2>/dev/null | tr -d '[:space:]') || log_size=""
  if [[ ! "$log_size" =~ ^[0-9]+$ ]]; then
    log_size=0
  fi
  if [ "$log_size" -gt 10485760 ]; then
    print_warning "日志文件较大: $((log_size / 1024 / 1024))MB"
  fi

  # Recent heartbeat records (at most the last five of each kind).
  print_info "查找最近的心跳记录..."
  heartbeat_success=$(grep -i "心跳发送成功\|heartbeat.*success\|心跳响应" "$LOG_FILE" 2>/dev/null | tail -5 || true)
  heartbeat_failed=$(grep -i "心跳发送失败\|heartbeat.*fail\|发送心跳失败" "$LOG_FILE" 2>/dev/null | tail -5 || true)
  heartbeat_error=$(grep -i "error.*heartbeat\|心跳.*error" "$LOG_FILE" 2>/dev/null | tail -5 || true)

  if [ -n "$heartbeat_success" ]; then
    echo -e "${GREEN}最近成功的心跳记录:${NC}"
    printf '%s\n' "$heartbeat_success" | sed 's/^/ /'
  fi

  if [ -n "$heartbeat_failed" ]; then
    echo -e "${YELLOW}最近失败的心跳记录:${NC}"
    printf '%s\n' "$heartbeat_failed" | sed 's/^/ /'
    # Not ((WARNINGS++)): that returns status 1 when the counter is 0 and
    # would abort the script under `set -e`.
    WARNINGS=$((WARNINGS + 1))
  fi

  if [ -n "$heartbeat_error" ]; then
    echo -e "${RED}最近的心跳错误记录:${NC}"
    printf '%s\n' "$heartbeat_error" | sed 's/^/ /'
    ISSUES=$((ISSUES + 1))
  fi

  # Most recent generic errors (last ten).
  recent_errors=$(grep -i "error\|fail\|panic" "$LOG_FILE" 2>/dev/null | tail -10 || true)
  if [ -n "$recent_errors" ]; then
    echo -e "${YELLOW}最近的错误记录最后10条:${NC}"
    printf '%s\n' "$recent_errors" | sed 's/^/ /'
  fi

  # Last line that mentions a heartbeat at all.
  last_heartbeat=$(grep -i "心跳" "$LOG_FILE" 2>/dev/null | tail -1 || true)
  if [ -n "$last_heartbeat" ]; then
    print_info "最后的心跳日志: $last_heartbeat"
  else
    print_warning "日志中未找到心跳记录"
  fi

  return 0
}
#######################################
# 5. Send one test heartbeat (POST type=pingServer) to the backend and
#    report the outcome.
# Globals:   FINAL_BACKEND_URL (read)
# Outputs:   report lines via the print_* helpers; matching node_id / node_ip
#            fragments of the response on stdout
# Returns:   0 on success, 1 on failure or when neither curl nor wget exists
#######################################
test_heartbeat() {
  local heartbeat_url response http_code body

  print_check_title "手动测试心跳发送"

  if [ -z "${FINAL_BACKEND_URL:-}" ]; then
    print_error "无法确定后端URL跳过心跳测试"
    return 1
  fi

  heartbeat_url="${FINAL_BACKEND_URL%/}/api/node/heartbeat"
  print_info "发送测试心跳到: $heartbeat_url"

  if command -v curl > /dev/null 2>&1; then
    # `|| true`: a failing curl must reach the error report below instead of
    # terminating the script through `set -e` at the assignment.
    response=$(curl -s -w "\n%{http_code}" --connect-timeout 10 --max-time 15 \
      -X POST \
      -H "Content-Type: application/x-www-form-urlencoded" \
      -d "type=pingServer" \
      "$heartbeat_url" 2>&1) || true

    # The status code is the last line; everything before it is the body.
    http_code=$(printf '%s\n' "$response" | tail -n 1)
    body=$(printf '%s\n' "$response" | sed '$d')
    http_code=${http_code:-000}

    if [ "$http_code" = "200" ]; then
      print_success "心跳发送成功 (HTTP 200)"
      if [ -n "$body" ]; then
        print_info "响应内容: $body"
        # Rough look into the JSON response for the node identity fields.
        if printf '%s\n' "$body" | grep -q "node_id\|node_ip"; then
          print_success "响应包含节点信息"
          printf '%s\n' "$body" \
            | grep -o '"node_id":[0-9]*\|"node_ip":"[^"]*"' 2>/dev/null || true
        fi
      fi
    else
      print_error "心跳发送失败 (HTTP $http_code)"
      if [ -n "$body" ]; then
        print_info "响应内容: $body"
      fi
      return 1
    fi
  elif command -v wget > /dev/null 2>&1; then
    # Testing the assignment directly keeps wget's exit status usable; a
    # separate `$?` check is never reached on failure under `set -e`.
    if response=$(wget -qO- --post-data="type=pingServer" \
      --header="Content-Type: application/x-www-form-urlencoded" \
      --timeout=15 \
      "$heartbeat_url" 2>&1); then
      print_success "心跳发送成功"
      if [ -n "$response" ]; then
        print_info "响应内容: $response"
      fi
    else
      print_error "心跳发送失败"
      return 1
    fi
  else
    print_warning "无法测试心跳(需要 curl 或 wget 命令)"
    return 1
  fi

  return 0
}
#######################################
# 6. Check system resources: disk usage of the current directory's
#    filesystem and available memory.
# Outputs:   report lines via the print_* helpers
# Returns:   0
#######################################
check_resources() {
  local disk_usage mem_line mem_total mem_free mem_avail mem_percent

  print_check_title "检查系统资源"

  # Disk space. `df -P` guarantees one line per filesystem, so a long device
  # name cannot wrap and shift the columns.
  if command -v df > /dev/null 2>&1; then
    disk_usage=$(df -P . 2>/dev/null \
      | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || disk_usage=""
    if [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
      if [ "$disk_usage" -gt 90 ]; then
        print_error "磁盘空间不足: ${disk_usage}%"
      elif [ "$disk_usage" -gt 80 ]; then
        print_warning "磁盘空间紧张: ${disk_usage}%"
      else
        print_success "磁盘空间充足: ${disk_usage}%"
      fi
    else
      print_warning "无法获取磁盘使用率"
    fi
  fi

  # Memory. LC_ALL=C pins the "Mem:" row label, which `free` translates in
  # other locales; a non-matching grep must not abort under `set -e`.
  if command -v free > /dev/null 2>&1; then
    mem_line=$(LC_ALL=C free -m 2>/dev/null | grep '^Mem') || mem_line=""
    mem_total=""
    mem_free=""
    mem_avail=""
    # Fields read: $2 total, $4 free, $7 available.
    read -r _ mem_total _ mem_free _ _ mem_avail _ <<< "$mem_line" || true
    if [ -z "$mem_avail" ]; then
      # No 7th column: fall back to the "free" column.
      mem_avail=$mem_free
    fi
    # Numeric guard plus total > 0 avoids test errors and division by zero.
    if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_avail" =~ ^[0-9]+$ ]] \
      && [ "$mem_total" -gt 0 ]; then
      mem_percent=$((mem_avail * 100 / mem_total))
      if [ "$mem_percent" -lt 10 ]; then
        print_error "可用内存不足: ${mem_avail}MB / ${mem_total}MB (${mem_percent}%)"
      elif [ "$mem_percent" -lt 20 ]; then
        print_warning "可用内存紧张: ${mem_avail}MB / ${mem_total}MB (${mem_percent}%)"
      else
        print_success "内存充足: ${mem_avail}MB / ${mem_total}MB (${mem_percent}%)"
      fi
    fi
  fi

  return 0
}
#######################################
# Main routine: run every check in order and print a summary.
# Network and log checks only run when the process is up; the optional
# manual heartbeat test is only offered when the network check passed.
# Globals:   ISSUES, WARNINGS, colour variables (read)
# Outputs:   banner, check reports and summary on stdout
# Returns:   0
#######################################
main() {
  local process_ok=0 network_ok=0 reply=""

  echo -e "${CYAN}"
  echo "========================================"
  echo " LinkMaster 节点心跳故障排查工具"
  echo "========================================"
  echo -e "${NC}"

  # Run the checks. Every call is guarded with `||`: under `set -e` a bare
  # failing call would end the script before its status could be read and
  # before the summary is printed.
  check_process || process_ok=$?
  check_config || true

  if [ "$process_ok" -eq 0 ]; then
    check_network || network_ok=$?
    check_logs || true

    if [ "$network_ok" -eq 0 ]; then
      echo ""
      # `read` fails at end of input (e.g. stdin is not a terminal); treat
      # that as "no" instead of aborting.
      read -p "是否执行手动心跳测试? (y/N): " -n 1 -r reply || reply=""
      echo ""
      if [[ $reply =~ ^[Yy]$ ]]; then
        test_heartbeat || true
      fi
    fi
  fi

  check_resources || true

  # Summary
  print_separator
  echo -e "\n${BLUE}排查总结:${NC}"
  if [ "$ISSUES" -eq 0 ] && [ "$WARNINGS" -eq 0 ]; then
    echo -e "${GREEN}✓ 未发现明显问题${NC}"
    echo -e "${CYAN}如果心跳仍然无法同步,请检查:${NC}"
    echo " 1. 后端服务是否正常运行"
    echo " 2. 后端数据库是否正常"
    echo " 3. 防火墙规则是否正确配置"
    echo " 4. 查看完整日志: ./run.sh logs-all"
  else
    if [ "$ISSUES" -gt 0 ]; then
      echo -e "${RED}发现 $ISSUES 个严重问题${NC}"
    fi
    if [ "$WARNINGS" -gt 0 ]; then
      echo -e "${YELLOW}发现 $WARNINGS 个警告${NC}"
    fi
    echo -e "\n${CYAN}建议操作:${NC}"
    echo " 1. 根据上述检查结果修复问题"
    echo " 2. 重启服务: ./run.sh restart"
    echo " 3. 查看实时日志: ./run.sh logs"
    echo " 4. 查看完整日志: ./run.sh logs-all"
  fi
  print_separator
}
# Entry point: run the main routine, handing over any command-line arguments.
main "$@"