#!/bin/bash
# ============================================
# LinkMaster node heartbeat troubleshooting script
# Purpose: diagnose node heartbeat synchronisation problems
# ============================================

# errexit stays enabled for the whole script, so every helper below must
# avoid returning a non-zero status unless it really signals a failure.
set -e

# Colour output (the escape sequences are expanded by printf %b / echo -e)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Script directory; relative paths below are resolved against it
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Configuration
BINARY_NAME="agent"
LOG_FILE="node.log"
PID_FILE="node.pid"
CONFIG_FILE="${CONFIG_PATH:-config.yaml}"

# Check results: number of errors and warnings reported so far
ISSUES=0
WARNINGS=0

# Print a separator line.
print_separator() {
    printf '%b========================================%b\n' "$CYAN" "$NC"
}

# Print the title of a check section.
#   $1 - title text
print_check_title() {
    printf '\n%b▶ %s%b\n' "$BLUE" "$1" "$NC"
}

# Print a success message.
#   $1 - message text
print_success() {
    printf '%b✓ %s%b\n' "$GREEN" "$1" "$NC"
}

# Print a warning message and count it in WARNINGS.
#   $1 - message text
print_warning() {
    printf '%b⚠ %s%b\n' "$YELLOW" "$1" "$NC"
    # Plain assignment instead of ((WARNINGS++)): the post-increment form
    # evaluates to 0 on the first call, which yields exit status 1 and
    # terminates the script under `set -e`.
    WARNINGS=$((WARNINGS + 1))
}

# Print an error message and count it in ISSUES.
#   $1 - message text
print_error() {
    printf '%b✗ %s%b\n' "$RED" "$1" "$NC"
    # See print_warning for why ((ISSUES++)) is not used.
    ISSUES=$((ISSUES + 1))
}

# Print an informational message.
#   $1 - message text
print_info() {
    printf '%bℹ %s%b\n' "$CYAN" "$1" "$NC"
}

#######################################
# Get the PID of the running node process.
# Globals:   PID_FILE (read; the file is removed when it is stale)
# Outputs:   the PID on stdout, or an empty line when no live process
#            is recorded
# Returns:   0 always
#######################################
get_pid() {
    local pid=""

    if [ -f "$PID_FILE" ]; then
        pid=$(cat "$PID_FILE" 2>/dev/null || true)
        pid=${pid//[[:space:]]/}
        if [ -n "$pid" ] && ps -p "$pid" > /dev/null 2>&1; then
            echo "$pid"
            return 0
        fi
        # The recorded process is gone: drop the stale PID file.
        rm -f "$PID_FILE"
    fi

    echo ""
}

#######################################
# 1. Check the process state.
# Globals:   PID, RUNTIME, CPU_MEM (written)
# Returns:   0 when the node process is running, 1 otherwise
#######################################
check_process() {
    print_check_title "检查进程状态"

    PID=$(get_pid)
    if [ -z "$PID" ]; then
        print_error "节点进程未运行"
        print_info "请使用 ./run.sh start 启动服务"
        return 1
    fi

    print_success "节点进程正在运行 (PID: $PID)"

    if command -v ps > /dev/null 2>&1; then
        local cpu="" mem="" usage=""

        # Process uptime
        RUNTIME=$(ps -o etime= -p "$PID" 2>/dev/null | tr -d ' ' || true)
        if [ -n "$RUNTIME" ]; then
            print_info "进程运行时间: $RUNTIME"
        fi

        # Process resource usage. Each column gets its own empty header so
        # that ps prints no header line, and the two values are kept apart
        # instead of being glued together by stripping all spaces.
        usage=$(ps -o %cpu= -o %mem= -p "$PID" 2>/dev/null || true)
        read -r cpu mem <<< "$usage" || true
        if [ -n "$cpu" ] && [ -n "$mem" ]; then
            CPU_MEM="${cpu}% / ${mem}%"
            print_info "CPU/内存使用: $CPU_MEM"
        fi
    fi

    return 0
}

#######################################
# Read one scalar from CONFIG_FILE.
# Uses yq when it is installed and runs successfully; otherwise falls back
# to a simple grep/sed lookup of the first line matching the bare key.
# A yq result of "null" (missing key) is reported as empty.
# Arguments: $1 - yq path (e.g. .backend.url)
#            $2 - bare key name for the grep fallback (e.g. url)
# Outputs:   the value on stdout (possibly empty)
# Returns:   0 always
#######################################
_config_value() {
    local yq_path=$1 key=$2 value=""

    if command -v yq > /dev/null 2>&1 \
        && value=$(yq eval "$yq_path" "$CONFIG_FILE" 2>/dev/null); then
        if [ "$value" = "null" ]; then
            value=""
        fi
    else
        value=$(grep -E "^[[:space:]]*${key}:" "$CONFIG_FILE" 2>/dev/null \
            | head -n 1 \
            | sed -E "s/.*${key}:[[:space:]]*//; s/[[:space:]]+\$//" \
            | tr -d "\"'" || true)
    fi

    printf '%s\n' "$value"
}

#######################################
# 2. Check the configuration file.
# Determines the backend URL with the precedence: BACKEND_URL environment
# variable, then the config file, then http://localhost:8080.
# Globals:   CONFIG_FILE, BACKEND_URL (read)
#            BACKEND_URL_FROM_CONFIG, HEARTBEAT_INTERVAL, NODE_ID, NODE_IP
#            (written), FINAL_BACKEND_URL (written and exported)
# Returns:   0 always
#######################################
check_config() {
    print_check_title "检查配置文件"

    if [ ! -f "$CONFIG_FILE" ]; then
        print_warning "配置文件不存在: $CONFIG_FILE"
        print_info "将使用环境变量和默认配置"

        # Check the environment variable. FINAL_BACKEND_URL is set here as
        # well, so the later network checks really use the announced value.
        if [ -n "${BACKEND_URL:-}" ]; then
            FINAL_BACKEND_URL="$BACKEND_URL"
            print_info "使用环境变量 BACKEND_URL: $BACKEND_URL"
        else
            FINAL_BACKEND_URL="http://localhost:8080"
            print_warning "未设置 BACKEND_URL 环境变量,将使用默认值: http://localhost:8080"
        fi
        export FINAL_BACKEND_URL
        return 0
    fi

    print_success "配置文件存在: $CONFIG_FILE"

    # Read the relevant settings from the config file
    BACKEND_URL_FROM_CONFIG=$(_config_value '.backend.url' 'url')
    HEARTBEAT_INTERVAL=$(_config_value '.heartbeat.interval' 'interval')
    NODE_ID=$(_config_value '.node.id' 'id')
    NODE_IP=$(_config_value '.node.ip' 'ip')

    # Decide which backend URL is in effect
    if [ -n "${BACKEND_URL:-}" ]; then
        FINAL_BACKEND_URL="$BACKEND_URL"
        print_info "使用环境变量 BACKEND_URL: $FINAL_BACKEND_URL"
    elif [ -n "$BACKEND_URL_FROM_CONFIG" ]; then
        FINAL_BACKEND_URL="$BACKEND_URL_FROM_CONFIG"
        print_info "使用配置文件中的后端URL: $FINAL_BACKEND_URL"
    else
        FINAL_BACKEND_URL="http://localhost:8080"
        print_warning "未找到后端URL配置,使用默认值: $FINAL_BACKEND_URL"
    fi

    if [ -n "$HEARTBEAT_INTERVAL" ]; then
        print_info "心跳间隔: ${HEARTBEAT_INTERVAL}秒"
    else
        print_info "心跳间隔: 60秒 (默认值)"
    fi

    if [ -n "$NODE_ID" ] && [ "$NODE_ID" != "0" ] && [ "$NODE_ID" != "null" ]; then
        print_success "节点ID已配置: $NODE_ID"
    else
        print_warning "节点ID未配置或为0,将在首次心跳时获取"
    fi

    if [ -n "$NODE_IP" ] && [ "$NODE_IP" != "null" ]; then
        print_success "节点IP已配置: $NODE_IP"
    else
        print_warning "节点IP未配置,将在首次心跳时获取"
    fi

    export FINAL_BACKEND_URL
}
# 3. Check network connectivity

#######################################
# Split a backend URL into host and port.
# Strips the scheme, any path and any user-info part; understands
# bracketed IPv6 literals. When the URL carries no port, the port defaults
# to 443 for https:// URLs and 80 otherwise.
# Arguments: $1 - backend URL
# Globals:   BACKEND_HOST, BACKEND_PORT (written)
#######################################
_parse_backend_url() {
    local url=$1 authority

    authority=${url#*://}
    authority=${authority%%/*}
    authority=${authority##*@}

    case "$authority" in
        \[*\]:*)
            BACKEND_HOST=${authority%%\]*}
            BACKEND_HOST=${BACKEND_HOST#\[}
            BACKEND_PORT=${authority##*\]:}
            ;;
        \[*\])
            BACKEND_HOST=${authority#\[}
            BACKEND_HOST=${BACKEND_HOST%\]}
            BACKEND_PORT=""
            ;;
        *:*)
            BACKEND_HOST=${authority%%:*}
            BACKEND_PORT=${authority##*:}
            ;;
        *)
            BACKEND_HOST=$authority
            BACKEND_PORT=""
            ;;
    esac

    if [ -z "$BACKEND_PORT" ]; then
        case "$url" in
            https://*) BACKEND_PORT=443 ;;
            *) BACKEND_PORT=80 ;;
        esac
    fi
}

# Succeed when $1 looks like an IPv4 or IPv6 literal (no lookup needed).
_is_ip_literal() {
    [[ "$1" =~ ^[0-9]+(\.[0-9]+){3}$ || "$1" == *:* ]]
}

#######################################
# Resolve a host name with the first available tool.
# getent is preferred because it also honours /etc/hosts.
# Arguments: $1 - host name
# Returns:   0 resolved, 1 resolution failed, 2 no lookup tool available
#######################################
_resolve_host() {
    local name=$1

    if command -v getent > /dev/null 2>&1; then
        if getent hosts "$name" > /dev/null 2>&1; then return 0; fi
        return 1
    elif command -v nslookup > /dev/null 2>&1; then
        if nslookup "$name" > /dev/null 2>&1; then return 0; fi
        return 1
    elif command -v host > /dev/null 2>&1; then
        if host "$name" > /dev/null 2>&1; then return 0; fi
        return 1
    fi
    return 2
}

#######################################
# Check name resolution, TCP reachability and the HTTP heartbeat endpoint
# of the backend.
# Globals:   FINAL_BACKEND_URL (read)
#            BACKEND_HOST, BACKEND_PORT, HEARTBEAT_URL, HTTP_CODE (written)
# Returns:   0 when no check failed hard, 1 otherwise
#######################################
check_network() {
    print_check_title "检查网络连接"

    if [ -z "$FINAL_BACKEND_URL" ]; then
        print_error "无法确定后端URL,跳过网络检查"
        return 1
    fi

    # Extract host and port
    _parse_backend_url "$FINAL_BACKEND_URL"
    print_info "后端地址: $BACKEND_HOST:$BACKEND_PORT"

    # Check DNS resolution (pointless for IP literals, where a reverse
    # lookup could fail even though the address is perfectly usable)
    if ! _is_ip_literal "$BACKEND_HOST"; then
        local dns_status=0
        _resolve_host "$BACKEND_HOST" || dns_status=$?
        case "$dns_status" in
            0)
                print_success "DNS解析成功: $BACKEND_HOST"
                ;;
            1)
                print_error "DNS解析失败: $BACKEND_HOST"
                return 1
                ;;
            *)
                # No lookup tool installed: skip the DNS check.
                ;;
        esac
    fi

    # Check port connectivity
    if command -v nc > /dev/null 2>&1; then
        if nc -z -w 3 "$BACKEND_HOST" "$BACKEND_PORT" 2>/dev/null; then
            print_success "端口连通性检查通过: $BACKEND_HOST:$BACKEND_PORT"
        else
            print_error "端口无法连接: $BACKEND_HOST:$BACKEND_PORT"
            print_info "可能原因: 防火墙阻止、后端服务未启动、网络不通"
            return 1
        fi
    elif command -v timeout > /dev/null 2>&1 && command -v bash > /dev/null 2>&1; then
        # Use bash's built-in /dev/tcp test. Host and port are passed as
        # positional parameters so they are never parsed as shell code.
        if timeout 3 bash -c 'echo > "/dev/tcp/$1/$2"' _ \
            "$BACKEND_HOST" "$BACKEND_PORT" 2>/dev/null; then
            print_success "端口连通性检查通过: $BACKEND_HOST:$BACKEND_PORT"
        else
            print_error "端口无法连接: $BACKEND_HOST:$BACKEND_PORT"
            print_info "可能原因: 防火墙阻止、后端服务未启动、网络不通"
            return 1
        fi
    else
        print_warning "无法检查端口连通性(需要 nc 或 timeout 命令)"
    fi

    # Check the HTTP connection
    HEARTBEAT_URL="${FINAL_BACKEND_URL%/}/api/node/heartbeat"
    print_info "测试心跳接口: $HEARTBEAT_URL"

    if command -v curl > /dev/null 2>&1; then
        # curl itself writes "000" when it got no response, so its output
        # is kept as-is and only an empty result is replaced by "000"
        # (appending a second "000" would produce "000000").
        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
            --connect-timeout 5 --max-time 10 \
            -X POST \
            -H "Content-Type: application/x-www-form-urlencoded" \
            -d "type=pingServer" \
            "$HEARTBEAT_URL" 2>/dev/null || true)
        HTTP_CODE=${HTTP_CODE:-000}

        if [ "$HTTP_CODE" = "200" ]; then
            print_success "心跳接口响应正常 (HTTP 200)"
        elif [ "$HTTP_CODE" = "000" ]; then
            print_error "无法连接到心跳接口"
            print_info "可能原因: 网络不通、后端服务未启动、防火墙阻止"
            return 1
        else
            print_warning "心跳接口返回异常状态码: HTTP $HTTP_CODE"
            print_info "这可能是正常的,取决于后端实现"
        fi
    elif command -v wget > /dev/null 2>&1; then
        # The pipeline's status is awk's, which is always 0, so a missing
        # status line has to be detected by the empty result.
        HTTP_CODE=$(wget --spider --server-response --timeout=5 --tries=1 \
            --post-data="type=pingServer" \
            --header="Content-Type: application/x-www-form-urlencoded" \
            "$HEARTBEAT_URL" 2>&1 \
            | grep -E "HTTP/" | tail -n 1 | awk '{print $2}' || true)
        HTTP_CODE=${HTTP_CODE:-000}

        if [ "$HTTP_CODE" = "200" ]; then
            print_success "心跳接口响应正常 (HTTP 200)"
        elif [ "$HTTP_CODE" = "000" ]; then
            print_error "无法连接到心跳接口"
            return 1
        else
            print_warning "心跳接口返回异常状态码: HTTP $HTTP_CODE"
        fi
    else
        print_warning "无法测试HTTP连接(需要 curl 或 wget 命令)"
    fi

    return 0
}

# Print every line of $1 prefixed with a single space.
_print_indented() {
    local line
    while IFS= read -r line; do
        printf ' %s\n' "$line"
    done <<< "$1"
}

#######################################
# 4. Check the log file for heartbeat successes, failures and errors.
# Globals:   LOG_FILE (read); WARNINGS, ISSUES (incremented)
#            LOG_SIZE, HEARTBEAT_SUCCESS, HEARTBEAT_FAILED,
#            HEARTBEAT_ERROR, RECENT_ERRORS, LAST_HEARTBEAT (written)
# Returns:   0 always
#######################################
check_logs() {
    print_check_title "检查日志文件"

    if [ ! -f "$LOG_FILE" ]; then
        print_warning "日志文件不存在: $LOG_FILE"
        print_info "如果服务刚启动,日志文件可能还未创建"
        return 0
    fi

    print_success "日志文件存在: $LOG_FILE"

    # Check the log file size (wc -c behaves the same on GNU and BSD)
    LOG_SIZE=$(wc -c < "$LOG_FILE" 2>/dev/null | tr -d '[:space:]' || true)
    case "$LOG_SIZE" in
        '' | *[!0-9]*) LOG_SIZE=0 ;;
    esac
    if [ "$LOG_SIZE" -gt 10485760 ]; then
        print_warning "日志文件较大: $((LOG_SIZE / 1024 / 1024))MB"
    fi

    # Look for the most recent heartbeat records
    print_info "查找最近的心跳记录..."

    HEARTBEAT_SUCCESS=$(grep -i -e "心跳发送成功" -e "heartbeat.*success" \
        -e "心跳响应" -- "$LOG_FILE" 2>/dev/null | tail -n 5 || true)
    HEARTBEAT_FAILED=$(grep -i -e "心跳发送失败" -e "heartbeat.*fail" \
        -e "发送心跳失败" -- "$LOG_FILE" 2>/dev/null | tail -n 5 || true)
    HEARTBEAT_ERROR=$(grep -i -e "error.*heartbeat" -e "心跳.*error" \
        -- "$LOG_FILE" 2>/dev/null | tail -n 5 || true)

    if [ -n "$HEARTBEAT_SUCCESS" ]; then
        printf '%b最近成功的心跳记录:%b\n' "${GREEN}" "${NC}"
        _print_indented "$HEARTBEAT_SUCCESS"
    fi

    if [ -n "$HEARTBEAT_FAILED" ]; then
        printf '%b最近失败的心跳记录:%b\n' "${YELLOW}" "${NC}"
        _print_indented "$HEARTBEAT_FAILED"
        # Not ((WARNINGS++)): that returns status 1 when the counter is 0
        # and would terminate the script under `set -e`.
        WARNINGS=$((WARNINGS + 1))
    fi

    if [ -n "$HEARTBEAT_ERROR" ]; then
        printf '%b最近的心跳错误记录:%b\n' "${RED}" "${NC}"
        _print_indented "$HEARTBEAT_ERROR"
        ISSUES=$((ISSUES + 1))
    fi

    # Check the most recent errors
    RECENT_ERRORS=$(grep -i -e "error" -e "fail" -e "panic" \
        -- "$LOG_FILE" 2>/dev/null | tail -n 10 || true)
    if [ -n "$RECENT_ERRORS" ]; then
        printf '%b最近的错误记录(最后10条):%b\n' "${YELLOW}" "${NC}"
        _print_indented "$RECENT_ERRORS"
    fi

    # Check the time of the last heartbeat
    LAST_HEARTBEAT=$(grep -i -e "心跳" -- "$LOG_FILE" 2>/dev/null | tail -n 1 || true)
    if [ -n "$LAST_HEARTBEAT" ]; then
        print_info "最后的心跳日志: $LAST_HEARTBEAT"
    else
        print_warning "日志中未找到心跳记录"
    fi

    return 0
}
# 5. Manually test the heartbeat

#######################################
# Send one test heartbeat (POST type=pingServer) to the backend and report
# the result.
# Globals:   FINAL_BACKEND_URL (read)
#            HEARTBEAT_URL, RESPONSE, HTTP_CODE, BODY (written)
# Returns:   0 when the heartbeat was accepted, 1 otherwise (including
#            when neither curl nor wget is available)
#######################################
test_heartbeat() {
    print_check_title "手动测试心跳发送"

    if [ -z "$FINAL_BACKEND_URL" ]; then
        print_error "无法确定后端URL,跳过心跳测试"
        return 1
    fi

    HEARTBEAT_URL="${FINAL_BACKEND_URL%/}/api/node/heartbeat"
    print_info "发送测试心跳到: $HEARTBEAT_URL"

    if command -v curl > /dev/null 2>&1; then
        # `|| true`: a transport failure must not abort the script under
        # `set -e`; curl still writes "000" as the status code then.
        RESPONSE=$(curl -s -w "\n%{http_code}" --connect-timeout 10 --max-time 15 \
            -X POST \
            -H "Content-Type: application/x-www-form-urlencoded" \
            -d "type=pingServer" \
            "$HEARTBEAT_URL" 2>/dev/null) || true

        # The status code is the last line; everything before it is the body.
        HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -n 1)
        BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')

        if [ "$HTTP_CODE" = "200" ]; then
            print_success "心跳发送成功 (HTTP 200)"
            if [ -n "$BODY" ]; then
                print_info "响应内容: $BODY"

                # Try to pick the node fields out of the JSON response
                if printf '%s\n' "$BODY" | grep -q "node_id\|node_ip"; then
                    print_success "响应包含节点信息"
                    printf '%s\n' "$BODY" \
                        | grep -o '"node_id":[0-9]*\|"node_ip":"[^"]*"' 2>/dev/null || true
                fi
            fi
        else
            print_error "心跳发送失败 (HTTP $HTTP_CODE)"
            if [ -n "$BODY" ]; then
                print_info "响应内容: $BODY"
            fi
            return 1
        fi
    elif command -v wget > /dev/null 2>&1; then
        # Testing the assignment directly keeps a wget failure from
        # terminating the script before the status can be inspected.
        if RESPONSE=$(wget -qO- --post-data="type=pingServer" \
            --header="Content-Type: application/x-www-form-urlencoded" \
            --timeout=15 \
            "$HEARTBEAT_URL" 2>&1); then
            print_success "心跳发送成功"
            if [ -n "$RESPONSE" ]; then
                print_info "响应内容: $RESPONSE"
            fi
        else
            print_error "心跳发送失败"
            return 1
        fi
    else
        print_warning "无法测试心跳(需要 curl 或 wget 命令)"
        return 1
    fi

    return 0
}

#######################################
# 6. Check system resources (disk usage of the current directory and
# available memory).
# Globals:   DISK_USAGE, MEM_TOTAL, MEM_AVAIL, MEM_PERCENT (written)
# Returns:   0 always
#######################################
check_resources() {
    print_check_title "检查系统资源"

    # Check disk space. `df -P` guarantees one output line per file system,
    # so the capacity column cannot be shifted by a wrapped device name.
    if command -v df > /dev/null 2>&1; then
        DISK_USAGE=$(df -P . 2>/dev/null \
            | awk 'NR == 2 { sub(/%/, "", $5); print $5 }' || true)
        case "$DISK_USAGE" in
            '' | *[!0-9]*)
                print_warning "无法获取磁盘使用率"
                ;;
            *)
                if [ "$DISK_USAGE" -gt 90 ]; then
                    print_error "磁盘空间不足: ${DISK_USAGE}%"
                elif [ "$DISK_USAGE" -gt 80 ]; then
                    print_warning "磁盘空间紧张: ${DISK_USAGE}%"
                else
                    print_success "磁盘空间充足: ${DISK_USAGE}%"
                fi
                ;;
        esac
    fi

    # Check memory. LC_ALL=C keeps the "Mem:" row label untranslated;
    # column 7 is used when present, otherwise column 4.
    if command -v free > /dev/null 2>&1; then
        local mem_line=""
        mem_line=$(LC_ALL=C free -m 2>/dev/null \
            | awk '/^Mem:/ { avail = ($7 != "") ? $7 : $4; print $2, avail }' || true)
        MEM_TOTAL=""
        MEM_AVAIL=""
        read -r MEM_TOTAL MEM_AVAIL <<< "$mem_line" || true

        case "${MEM_TOTAL}${MEM_AVAIL}" in
            '' | *[!0-9]*)
                # No usable figures: skip the memory check.
                ;;
            *)
                # Guard against a zero total to avoid a division by zero.
                if [ -n "$MEM_TOTAL" ] && [ -n "$MEM_AVAIL" ] && [ "$MEM_TOTAL" -gt 0 ]; then
                    MEM_PERCENT=$((MEM_AVAIL * 100 / MEM_TOTAL))

                    if [ "$MEM_PERCENT" -lt 10 ]; then
                        print_error "可用内存不足: ${MEM_AVAIL}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
                    elif [ "$MEM_PERCENT" -lt 20 ]; then
                        print_warning "可用内存紧张: ${MEM_AVAIL}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
                    else
                        print_success "内存充足: ${MEM_AVAIL}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
                    fi
                fi
                ;;
        esac
    fi

    return 0
}

#######################################
# Main function: run all checks in order and print a summary.
# Every check is invoked through `|| ...` so that a failing check is
# recorded instead of terminating the script under `set -e`.
# Globals:   ISSUES, WARNINGS (read); PROCESS_OK, NETWORK_OK (written)
#######################################
main() {
    echo -e "${CYAN}"
    echo "========================================"
    echo " LinkMaster 节点心跳故障排查工具"
    echo "========================================"
    echo -e "${NC}"

    # Run the individual checks
    PROCESS_OK=0
    check_process || PROCESS_OK=$?

    check_config || true

    if [ "$PROCESS_OK" -eq 0 ]; then
        NETWORK_OK=0
        check_network || NETWORK_OK=$?

        check_logs || true

        if [ "$NETWORK_OK" -eq 0 ]; then
            echo ""
            # read fails at end of input (e.g. stdin closed); treat that
            # as "no" instead of aborting.
            REPLY=""
            read -p "是否执行手动心跳测试? (y/N): " -n 1 -r || REPLY=""
            echo ""
            if [[ $REPLY =~ ^[Yy]$ ]]; then
                test_heartbeat || true
            fi
        fi
    fi

    check_resources || true

    # Summary
    print_separator
    echo -e "\n${BLUE}排查总结:${NC}"

    if [ "$ISSUES" -eq 0 ] && [ "$WARNINGS" -eq 0 ]; then
        echo -e "${GREEN}✓ 未发现明显问题${NC}"
        echo -e "${CYAN}如果心跳仍然无法同步,请检查:${NC}"
        echo " 1. 后端服务是否正常运行"
        echo " 2. 后端数据库是否正常"
        echo " 3. 防火墙规则是否正确配置"
        echo " 4. 查看完整日志: ./run.sh logs-all"
    else
        if [ "$ISSUES" -gt 0 ]; then
            echo -e "${RED}发现 $ISSUES 个严重问题${NC}"
        fi
        if [ "$WARNINGS" -gt 0 ]; then
            echo -e "${YELLOW}发现 $WARNINGS 个警告${NC}"
        fi
        echo -e "\n${CYAN}建议操作:${NC}"
        echo " 1. 根据上述检查结果修复问题"
        echo " 2. 重启服务: ./run.sh restart"
        echo " 3. 查看实时日志: ./run.sh logs"
        echo " 4. 查看完整日志: ./run.sh logs-all"
    fi

    print_separator
}

# Run the main function
main "$@"