feat: 添加日志文件输出功能和心跳故障排查工具

- 新增日志文件输出功能,支持配置日志文件路径和级别
- 添加心跳故障排查脚本 check-heartbeat.sh
- 支持通过环境变量 LOG_FILE 设置日志文件路径
- 日志自动创建目录,支持相对路径和绝对路径
- 优化日志初始化逻辑,支持直接写入文件
- 改进配置加载,支持日志配置项
- 完善文档,添加故障排查章节和日志功能说明
- 更新版本号至 v1.1.0
This commit is contained in:
2025-12-07 16:37:03 +08:00
parent 74c1db2f14
commit d8ea772c24
5 changed files with 745 additions and 13 deletions

512
check-heartbeat.sh Executable file
View File

@@ -0,0 +1,512 @@
#!/bin/bash
# ============================================
# LinkMaster node heartbeat troubleshooting script
# Purpose: diagnose node heartbeat synchronization problems
# ============================================
# errexit: any command that fails outside a conditional context aborts the script.
set -e
# ANSI color escape sequences (kept as literal backslash text; expanded later by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Directory containing this script; the script changes into it so the relative
# file names below resolve against the script's own location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# Configuration
BINARY_NAME="agent"
LOG_FILE="node.log"
PID_FILE="node.pid"
# The CONFIG_PATH environment variable overrides the default config.yaml.
CONFIG_FILE="${CONFIG_PATH:-config.yaml}"
# Result counters: ISSUES counts errors, WARNINGS counts warnings (both reported in the summary)
ISSUES=0
WARNINGS=0
# Print a cyan horizontal separator line to stdout.
# Globals: CYAN, NC (read)
print_separator() {
  local rule="========================================"
  echo -e "${CYAN}${rule}${NC}"
}
# Print the heading of a check section in blue, preceded by an empty line.
# Globals:   BLUE, NC (read)
# Arguments: $1 - heading text
print_check_title() {
  local heading=$1
  echo -e "\n${BLUE}${heading}${NC}"
}
# Print a success message in green.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
print_success() {
  local message=$1
  echo -e "${GREEN}${message}${NC}"
}
# Print a warning message in yellow and count it.
# Globals:   YELLOW, NC (read); WARNINGS (incremented)
# Arguments: $1 - message text
# Returns:   0
print_warning() {
  echo -e "${YELLOW}$1${NC}"
  # Use an arithmetic assignment rather than ((WARNINGS++)): the latter has
  # exit status 1 whenever the pre-increment value is 0, which made the very
  # first warning abort the whole script under `set -e`.
  WARNINGS=$((WARNINGS + 1))
}
# Print an error message in red and count it.
# Globals:   RED, NC (read); ISSUES (incremented)
# Arguments: $1 - message text
# Returns:   0
print_error() {
  echo -e "${RED}$1${NC}"
  # Use an arithmetic assignment rather than ((ISSUES++)): the latter has exit
  # status 1 whenever the pre-increment value is 0, which made the very first
  # reported error abort the whole script under `set -e`.
  ISSUES=$((ISSUES + 1))
}
# Print an informational detail line in cyan (the text is prefixed with one space).
# Globals:   CYAN, NC (read)
# Arguments: $1 - detail text
print_info() {
  local detail=$1
  echo -e "${CYAN} ${detail}${NC}"
}
# Print the PID recorded in PID_FILE if that process is alive.
# Prints an empty line when there is no PID file or when the recorded process
# is gone; in the latter case the stale PID file is deleted.
# Globals: PID_FILE (read, may be removed); PID (written)
# Outputs: the live PID, or an empty line, on stdout
get_pid() {
  # Guard clause: nothing recorded at all.
  [ -f "$PID_FILE" ] || { echo ""; return; }

  PID=$(< "$PID_FILE")
  if ! ps -p "$PID" > /dev/null 2>&1; then
    # Recorded process no longer exists: drop the stale file.
    rm -f "$PID_FILE"
    echo ""
    return
  fi
  echo "$PID"
}
# 1. Check process status
#######################################
# Report whether the node process recorded in the PID file is running and,
# if so, its elapsed run time and CPU / memory usage.
# Globals: PID (written)
# Returns: 0 when the process is running, 1 when it is not
#######################################
check_process() {
  print_check_title "检查进程状态"
  PID=$(get_pid)
  if [ -z "$PID" ]; then
    print_error "节点进程未运行"
    print_info "请使用 ./run.sh start 启动服务"
    return 1
  fi

  print_success "节点进程正在运行 (PID: $PID)"

  if command -v ps > /dev/null 2>&1; then
    local runtime cpu_pct mem_pct

    # Elapsed run time of the process.
    runtime=$(ps -o etime= -p "$PID" 2>/dev/null | tr -d ' ')
    if [ -n "$runtime" ]; then
      print_info "进程运行时间: $runtime"
    fi

    # Resource usage. Each column gets its own "-o name=" so that every header
    # is empty and ps prints no header line (with "-o %cpu,%mem=" the %CPU
    # header was still printed). The two values are read as separate fields
    # instead of deleting all spaces, which used to fuse them ("0.00.1").
    read -r cpu_pct mem_pct <<< "$(ps -o %cpu= -o %mem= -p "$PID" 2>/dev/null)" || true
    if [ -n "$cpu_pct" ] && [ -n "$mem_pct" ]; then
      print_info "CPU/内存使用: ${cpu_pct}% / ${mem_pct}%"
    fi
  fi
  return 0
}
# 2. Check the configuration file
#######################################
# Read one scalar from CONFIG_FILE and print it (empty line when absent).
# Uses yq when available, otherwise a simple grep/sed match on the bare key.
# A literal "null" (what yq prints for a missing key) is treated as absent.
# Arguments: $1 - yq path (e.g. .backend.url), $2 - bare key for the fallback
#######################################
_check_config_value() {
  local yq_path=$1 key=$2 value=""
  if command -v yq > /dev/null 2>&1; then
    value=$(yq eval "$yq_path" "$CONFIG_FILE" 2>/dev/null) || value=""
  else
    # POSIX character classes instead of the GNU-only "\s"; trailing
    # whitespace (including a CR) is stripped before the quotes are removed.
    value=$(grep -E "^[[:space:]]*${key}:" "$CONFIG_FILE" \
      | head -1 \
      | sed -e "s/.*${key}:[[:space:]]*//" -e 's/[[:space:]]*$//' \
      | tr -d "\"'") || value=""
  fi
  if [ "$value" = "null" ]; then
    value=""
  fi
  printf '%s\n' "$value"
}

#######################################
# Determine the backend URL the node will use and report the heartbeat
# interval and node identity settings.
# Precedence for the URL: BACKEND_URL environment variable, then the config
# file, then the default http://localhost:8080.
# Globals:
#   CONFIG_FILE, BACKEND_URL (read)
#   FINAL_BACKEND_URL (written and exported for the later checks)
#   BACKEND_URL_FROM_CONFIG, HEARTBEAT_INTERVAL, NODE_ID, NODE_IP (written)
# Returns: 0
#######################################
check_config() {
  print_check_title "检查配置文件"

  if [ ! -f "$CONFIG_FILE" ]; then
    print_warning "配置文件不存在: $CONFIG_FILE"
    print_info "将使用环境变量和默认配置"
    # Resolve the URL here as well: previously this branch returned without
    # setting FINAL_BACKEND_URL, so the later network check reported that no
    # backend URL could be determined even though a default was announced.
    if [ -n "$BACKEND_URL" ]; then
      FINAL_BACKEND_URL="$BACKEND_URL"
      print_info "使用环境变量 BACKEND_URL: $BACKEND_URL"
    else
      FINAL_BACKEND_URL="http://localhost:8080"
      print_warning "未设置 BACKEND_URL 环境变量,将使用默认值: http://localhost:8080"
    fi
    export FINAL_BACKEND_URL
    return 0
  fi

  print_success "配置文件存在: $CONFIG_FILE"

  BACKEND_URL_FROM_CONFIG=$(_check_config_value '.backend.url' 'url')
  HEARTBEAT_INTERVAL=$(_check_config_value '.heartbeat.interval' 'interval')
  NODE_ID=$(_check_config_value '.node.id' 'id')
  NODE_IP=$(_check_config_value '.node.ip' 'ip')

  # Decide which backend URL is effective.
  if [ -n "$BACKEND_URL" ]; then
    FINAL_BACKEND_URL="$BACKEND_URL"
    print_info "使用环境变量 BACKEND_URL: $FINAL_BACKEND_URL"
  elif [ -n "$BACKEND_URL_FROM_CONFIG" ]; then
    FINAL_BACKEND_URL="$BACKEND_URL_FROM_CONFIG"
    print_info "使用配置文件中的后端URL: $FINAL_BACKEND_URL"
  else
    FINAL_BACKEND_URL="http://localhost:8080"
    print_warning "未找到后端URL配置使用默认值: $FINAL_BACKEND_URL"
  fi

  if [ -n "$HEARTBEAT_INTERVAL" ]; then
    print_info "心跳间隔: ${HEARTBEAT_INTERVAL}"
  else
    print_info "心跳间隔: 60秒 (默认值)"
  fi

  if [ -n "$NODE_ID" ] && [ "$NODE_ID" != "0" ]; then
    print_success "节点ID已配置: $NODE_ID"
  else
    print_warning "节点ID未配置或为0将在首次心跳时获取"
  fi

  if [ -n "$NODE_IP" ]; then
    print_success "节点IP已配置: $NODE_IP"
  else
    print_warning "节点IP未配置将在首次心跳时获取"
  fi

  export FINAL_BACKEND_URL
}
# 3. Check network connectivity
#######################################
# Verify that the backend named by FINAL_BACKEND_URL is reachable:
# DNS resolution, TCP port connectivity, then a POST to the heartbeat endpoint.
# Globals:
#   FINAL_BACKEND_URL (read)
#   BACKEND_HOST, BACKEND_PORT, HEARTBEAT_URL, HTTP_CODE (written)
# Returns: 0 when no hard failure was found, 1 on the first hard failure
#######################################
check_network() {
  print_check_title "检查网络连接"
  if [ -z "$FINAL_BACKEND_URL" ]; then
    print_error "无法确定后端URL跳过网络检查"
    return 1
  fi

  # Split "scheme://host[:port]/path" with parameter expansion. The previous
  # `cut -d: -f2` returned the whole host when there was no ":port" part, so
  # the default-port branch below could never trigger.
  # NOTE: bracketed IPv6 literals ("[::1]:8080") are not handled here.
  local authority=${FINAL_BACKEND_URL#*://}
  authority=${authority%%/*}
  BACKEND_HOST=${authority%%:*}
  BACKEND_PORT=""
  if [[ "$authority" == *:* ]]; then
    BACKEND_PORT=${authority##*:}
  fi
  if [ -z "$BACKEND_PORT" ]; then
    case "$FINAL_BACKEND_URL" in
      https://*) BACKEND_PORT=443 ;;
      *) BACKEND_PORT=80 ;;
    esac
  fi
  print_info "后端地址: $BACKEND_HOST:$BACKEND_PORT"

  # DNS resolution. Skipped for IPv4 literals and "localhost": nslookup/host
  # query the DNS server directly (a reverse lookup for an IP), which commonly
  # fails for such names and would wrongly end the check here.
  local needs_dns=1
  if [[ "$BACKEND_HOST" == "localhost" || "$BACKEND_HOST" =~ ^[0-9]+(\.[0-9]+){3}$ ]]; then
    needs_dns=0
  fi
  if [ "$needs_dns" -eq 1 ]; then
    local resolver=""
    if command -v nslookup > /dev/null 2>&1; then
      resolver="nslookup"
    elif command -v host > /dev/null 2>&1; then
      resolver="host"
    fi
    if [ -n "$resolver" ]; then
      if "$resolver" "$BACKEND_HOST" > /dev/null 2>&1; then
        print_success "DNS解析成功: $BACKEND_HOST"
      else
        print_error "DNS解析失败: $BACKEND_HOST"
        return 1
      fi
    fi
  fi

  # TCP port connectivity.
  if command -v nc > /dev/null 2>&1; then
    if nc -z -w 3 "$BACKEND_HOST" "$BACKEND_PORT" 2>/dev/null; then
      print_success "端口连通性检查通过: $BACKEND_HOST:$BACKEND_PORT"
    else
      print_error "端口无法连接: $BACKEND_HOST:$BACKEND_PORT"
      print_info "可能原因: 防火墙阻止、后端服务未启动、网络不通"
      return 1
    fi
  elif command -v timeout > /dev/null 2>&1 && command -v bash > /dev/null 2>&1; then
    # bash's built-in /dev/tcp test. Host and port are passed as positional
    # arguments so configuration values are never parsed as shell code.
    if timeout 3 bash -c 'echo > "/dev/tcp/$1/$2"' _ "$BACKEND_HOST" "$BACKEND_PORT" 2>/dev/null; then
      print_success "端口连通性检查通过: $BACKEND_HOST:$BACKEND_PORT"
    else
      print_error "端口无法连接: $BACKEND_HOST:$BACKEND_PORT"
      print_info "可能原因: 防火墙阻止、后端服务未启动、网络不通"
      return 1
    fi
  else
    print_warning "无法检查端口连通性(需要 nc 或 timeout 命令)"
  fi

  # HTTP check against the heartbeat endpoint.
  HEARTBEAT_URL="${FINAL_BACKEND_URL%/}/api/node/heartbeat"
  print_info "测试心跳接口: $HEARTBEAT_URL"
  if command -v curl > /dev/null 2>&1; then
    # On a connection failure curl itself already writes "000" via -w and then
    # exits non-zero; appending another "000" (the old `|| echo "000"`)
    # produced "000000", so the failure branch below was never reached.
    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 --max-time 10 \
      -X POST \
      -H "Content-Type: application/x-www-form-urlencoded" \
      -d "type=pingServer" \
      "$HEARTBEAT_URL" 2>/dev/null) || true
    HTTP_CODE=${HTTP_CODE:-000}
    if [ "$HTTP_CODE" = "200" ]; then
      print_success "心跳接口响应正常 (HTTP 200)"
    elif [ "$HTTP_CODE" = "000" ]; then
      print_error "无法连接到心跳接口"
      print_info "可能原因: 网络不通、后端服务未启动、防火墙阻止"
      return 1
    else
      print_warning "心跳接口返回异常状态码: HTTP $HTTP_CODE"
      print_info "这可能是正常的,取决于后端实现"
    fi
  elif command -v wget > /dev/null 2>&1; then
    # The pipeline's status is awk's, so a failed wget yields an empty string
    # rather than a non-zero status; map "no status line seen" to 000.
    HTTP_CODE=$(wget --spider --server-response --timeout=5 --tries=1 \
      --post-data="type=pingServer" \
      --header="Content-Type: application/x-www-form-urlencoded" \
      "$HEARTBEAT_URL" 2>&1 | grep -E "HTTP/" | tail -1 | awk '{print $2}') || true
    HTTP_CODE=${HTTP_CODE:-000}
    if [ "$HTTP_CODE" = "200" ]; then
      print_success "心跳接口响应正常 (HTTP 200)"
    elif [ "$HTTP_CODE" = "000" ]; then
      print_error "无法连接到心跳接口"
      return 1
    else
      print_warning "心跳接口返回异常状态码: HTTP $HTTP_CODE"
    fi
  else
    print_warning "无法测试HTTP连接需要 curl 或 wget 命令)"
  fi
  return 0
}
# 4. Check the log file
#######################################
# Print every line of a multi-line string prefixed with one space.
# Arguments: $1 - text to print
#######################################
_check_logs_print_indented() {
  local line
  while IFS= read -r line; do
    echo " $line"
  done <<< "$1"
}

#######################################
# Inspect LOG_FILE: report its size when large, show the most recent
# heartbeat successes / failures / errors, the last generic errors and the
# last heartbeat-related line.
# Globals:
#   LOG_FILE (read)
#   WARNINGS (incremented when failed heartbeats are found)
#   ISSUES (incremented when heartbeat errors are found)
#   LOG_SIZE, HEARTBEAT_SUCCESS, HEARTBEAT_FAILED, HEARTBEAT_ERROR,
#   RECENT_ERRORS, LAST_HEARTBEAT (written)
# Returns: 0
#######################################
check_logs() {
  print_check_title "检查日志文件"
  if [ ! -f "$LOG_FILE" ]; then
    print_warning "日志文件不存在: $LOG_FILE"
    print_info "如果服务刚启动,日志文件可能还未创建"
    return 0
  fi
  print_success "日志文件存在: $LOG_FILE"

  # File size in bytes: BSD stat syntax first, then GNU stat, else 0.
  LOG_SIZE=$(stat -f%z "$LOG_FILE" 2>/dev/null || stat -c%s "$LOG_FILE" 2>/dev/null || echo "0")
  if [ "$LOG_SIZE" -gt 10485760 ]; then
    print_warning "日志文件较大: $((LOG_SIZE / 1024 / 1024))MB"
  fi

  # Most recent heartbeat records (last five of each kind).
  print_info "查找最近的心跳记录..."
  HEARTBEAT_SUCCESS=$(grep -i "心跳发送成功\|heartbeat.*success\|心跳响应" "$LOG_FILE" 2>/dev/null | tail -5 || true)
  HEARTBEAT_FAILED=$(grep -i "心跳发送失败\|heartbeat.*fail\|发送心跳失败" "$LOG_FILE" 2>/dev/null | tail -5 || true)
  HEARTBEAT_ERROR=$(grep -i "error.*heartbeat\|心跳.*error" "$LOG_FILE" 2>/dev/null | tail -5 || true)

  if [ -n "$HEARTBEAT_SUCCESS" ]; then
    echo -e "${GREEN}最近成功的心跳记录:${NC}"
    _check_logs_print_indented "$HEARTBEAT_SUCCESS"
  fi
  if [ -n "$HEARTBEAT_FAILED" ]; then
    echo -e "${YELLOW}最近失败的心跳记录:${NC}"
    _check_logs_print_indented "$HEARTBEAT_FAILED"
    # Arithmetic assignment instead of ((WARNINGS++)): the latter returns
    # status 1 when the counter is 0 and aborted the script under `set -e`.
    WARNINGS=$((WARNINGS + 1))
  fi
  if [ -n "$HEARTBEAT_ERROR" ]; then
    echo -e "${RED}最近的心跳错误记录:${NC}"
    _check_logs_print_indented "$HEARTBEAT_ERROR"
    # Same reasoning as for WARNINGS above.
    ISSUES=$((ISSUES + 1))
  fi

  # Last ten generic error lines.
  RECENT_ERRORS=$(grep -i "error\|fail\|panic" "$LOG_FILE" 2>/dev/null | tail -10 || true)
  if [ -n "$RECENT_ERRORS" ]; then
    echo -e "${YELLOW}最近的错误记录最后10条:${NC}"
    _check_logs_print_indented "$RECENT_ERRORS"
  fi

  # Last heartbeat-related line of the log.
  LAST_HEARTBEAT=$(grep -i "心跳" "$LOG_FILE" 2>/dev/null | tail -1 || true)
  if [ -n "$LAST_HEARTBEAT" ]; then
    print_info "最后的心跳日志: $LAST_HEARTBEAT"
  else
    print_warning "日志中未找到心跳记录"
  fi
}
# 5. Manually test the heartbeat
#######################################
# Send one test heartbeat (POST type=pingServer) to the backend and report
# the HTTP status and response body.
# Globals:
#   FINAL_BACKEND_URL (read)
#   HEARTBEAT_URL, RESPONSE, HTTP_CODE, BODY (written)
# Returns: 0 on HTTP 200 (curl) / successful request (wget), 1 otherwise
#######################################
test_heartbeat() {
  print_check_title "手动测试心跳发送"
  if [ -z "$FINAL_BACKEND_URL" ]; then
    print_error "无法确定后端URL跳过心跳测试"
    return 1
  fi
  HEARTBEAT_URL="${FINAL_BACKEND_URL%/}/api/node/heartbeat"
  print_info "发送测试心跳到: $HEARTBEAT_URL"

  if command -v curl > /dev/null 2>&1; then
    # `|| true`: a failed transfer must not trip `set -e` on this assignment;
    # the failure is reported below through the status code instead.
    RESPONSE=$(curl -s -w "\n%{http_code}" --connect-timeout 10 --max-time 15 \
      -X POST \
      -H "Content-Type: application/x-www-form-urlencoded" \
      -d "type=pingServer" \
      "$HEARTBEAT_URL" 2>&1) || true
    # The status code is the last line (appended by -w); everything before it
    # is the response body.
    HTTP_CODE=${RESPONSE##*$'\n'}
    HTTP_CODE=${HTTP_CODE:-000}
    BODY=""
    if [[ "$RESPONSE" == *$'\n'* ]]; then
      BODY=${RESPONSE%$'\n'*}
    fi

    if [ "$HTTP_CODE" = "200" ]; then
      print_success "心跳发送成功 (HTTP 200)"
      if [ -n "$BODY" ]; then
        print_info "响应内容: $BODY"
        # Best-effort extraction of node fields from a JSON response.
        if echo "$BODY" | grep -q "node_id\|node_ip"; then
          print_success "响应包含节点信息"
          echo "$BODY" | grep -o '"node_id":[0-9]*\|"node_ip":"[^"]*"' 2>/dev/null || true
        fi
      fi
    else
      print_error "心跳发送失败 (HTTP $HTTP_CODE)"
      if [ -n "$BODY" ]; then
        print_info "响应内容: $BODY"
      fi
      return 1
    fi
  elif command -v wget > /dev/null 2>&1; then
    # Test the assignment directly: checking `$?` on a following line never
    # ran for a failure, because `set -e` aborted on the assignment itself.
    if RESPONSE=$(wget -qO- --post-data="type=pingServer" \
      --header="Content-Type: application/x-www-form-urlencoded" \
      --timeout=15 \
      "$HEARTBEAT_URL" 2>&1); then
      print_success "心跳发送成功"
      if [ -n "$RESPONSE" ]; then
        print_info "响应内容: $RESPONSE"
      fi
    else
      print_error "心跳发送失败"
      return 1
    fi
  else
    print_warning "无法测试心跳(需要 curl 或 wget 命令)"
    return 1
  fi
  return 0
}
# 6. Check system resources
#######################################
# Report disk usage of the current directory's filesystem and the share of
# available memory, flagging tight or exhausted resources.
# Thresholds: disk > 90% error, > 80% warning; memory < 10% error, < 20% warning.
# Globals: DISK_USAGE, MEM_TOTAL, MEM_AVAIL, MEM_PERCENT (written)
# Returns: 0
#######################################
check_resources() {
  print_check_title "检查系统资源"

  # Disk space. `df -P` guarantees one line per filesystem (long device names
  # are not wrapped), so the capacity is always field 5 of line 2.
  if command -v df > /dev/null 2>&1; then
    DISK_USAGE=$(df -P . 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || DISK_USAGE=""
    # Only compare when a plain number was obtained; a non-numeric value
    # would make the integer tests below fail with an error.
    if [[ "$DISK_USAGE" =~ ^[0-9]+$ ]]; then
      if [ "$DISK_USAGE" -gt 90 ]; then
        print_error "磁盘空间不足: ${DISK_USAGE}%"
      elif [ "$DISK_USAGE" -gt 80 ]; then
        print_warning "磁盘空间紧张: ${DISK_USAGE}%"
      else
        print_success "磁盘空间充足: ${DISK_USAGE}%"
      fi
    fi
  fi

  # Memory. LC_ALL=C forces the untranslated "Mem:" row label; on a localized
  # system the old `grep Mem` matched nothing and its failure aborted the
  # script under `set -e`. Column 7 is used as "available", falling back to
  # column 4 when column 7 is absent.
  if command -v free > /dev/null 2>&1; then
    local mem_fields
    mem_fields=$(LC_ALL=C free -m 2>/dev/null \
      | awk '/^Mem:/ { print $2, ($7 != "" ? $7 : $4) }') || mem_fields=""
    MEM_TOTAL=""
    MEM_AVAIL=""
    read -r MEM_TOTAL MEM_AVAIL <<< "$mem_fields" || true
    # Guard against missing / non-numeric values and division by zero.
    if [[ "$MEM_TOTAL" =~ ^[0-9]+$ && "$MEM_AVAIL" =~ ^[0-9]+$ ]] && [ "$MEM_TOTAL" -gt 0 ]; then
      MEM_PERCENT=$((MEM_AVAIL * 100 / MEM_TOTAL))
      if [ "$MEM_PERCENT" -lt 10 ]; then
        print_error "可用内存不足: ${MEM_AVAIL}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
      elif [ "$MEM_PERCENT" -lt 20 ]; then
        print_warning "可用内存紧张: ${MEM_AVAIL}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
      else
        print_success "内存充足: ${MEM_AVAIL}MB / ${MEM_TOTAL}MB (${MEM_PERCENT}%)"
      fi
    fi
  fi
}
# Main function
#######################################
# Run every check in order and print a summary.
# Network and log checks run only when the node process is running; the
# interactive heartbeat test is offered only when the network check passed.
# Globals:
#   ISSUES, WARNINGS (read for the summary)
#   PROCESS_OK, NETWORK_OK, REPLY (written)
# Returns: 0
#######################################
main() {
  echo -e "${CYAN}"
  echo "========================================"
  echo " LinkMaster 节点心跳故障排查工具"
  echo "========================================"
  echo -e "${NC}"

  # Every check is invoked as `check || ...`. Under `set -e` a bare
  # `check; STATUS=$?` can never observe a failure: the script exits on the
  # non-zero return before the assignment runs, and the summary is never
  # printed. A failed check is a finding to report, not a reason to abort.
  PROCESS_OK=0
  check_process || PROCESS_OK=$?
  check_config || true

  if [ "$PROCESS_OK" -eq 0 ]; then
    NETWORK_OK=0
    check_network || NETWORK_OK=$?
    check_logs || true
    if [ "$NETWORK_OK" -eq 0 ]; then
      echo ""
      REPLY=""
      # `read` returns non-zero at end of input (non-interactive run); treat
      # that as the default answer "no" instead of aborting.
      read -p "是否执行手动心跳测试? (y/N): " -n 1 -r || REPLY=""
      echo ""
      if [[ $REPLY =~ ^[Yy]$ ]]; then
        test_heartbeat || true
      fi
    fi
  fi
  check_resources || true

  # Summary
  print_separator
  echo -e "\n${BLUE}排查总结:${NC}"
  if [ "$ISSUES" -eq 0 ] && [ "$WARNINGS" -eq 0 ]; then
    echo -e "${GREEN}✓ 未发现明显问题${NC}"
    echo -e "${CYAN}如果心跳仍然无法同步,请检查:${NC}"
    echo " 1. 后端服务是否正常运行"
    echo " 2. 后端数据库是否正常"
    echo " 3. 防火墙规则是否正确配置"
    echo " 4. 查看完整日志: ./run.sh logs-all"
  else
    if [ "$ISSUES" -gt 0 ]; then
      echo -e "${RED}发现 $ISSUES 个严重问题${NC}"
    fi
    if [ "$WARNINGS" -gt 0 ]; then
      echo -e "${YELLOW}发现 $WARNINGS 个警告${NC}"
    fi
    echo -e "\n${CYAN}建议操作:${NC}"
    echo " 1. 根据上述检查结果修复问题"
    echo " 2. 重启服务: ./run.sh restart"
    echo " 3. 查看实时日志: ./run.sh logs"
    echo " 4. 查看完整日志: ./run.sh logs-all"
  fi
  print_separator
}
# Run the main function
main