jecloud java进程jvm监控脚本

常用参数介绍:

  • ENV_NAME 指定环境名称
  • PROCESS_NAMES 指定监控的进程(进程标识必须唯一)
  • RESTART_ENABLE 指定是否开启重启
  • JEINIT_DIR 指定jeinit的目录
  • EMAIL_ENABLE 指定是否开启邮件通知
#!/bin/bash

# ---------------------- Configuration ----------------------
# Every scalar setting below can be overridden from the environment of the
# process that starts this script; when the variable is unset or empty the
# default shown here is used.
CHECK_INTERVAL="${CHECK_INTERVAL:-200}"                        # polling interval (seconds)
OLD_GEN_THRESHOLD="${OLD_GEN_THRESHOLD:-85}"                   # old-generation alert threshold (percent)
META_SPACE_THRESHOLD="${META_SPACE_THRESHOLD:-95}"             # metaspace alert threshold (percent)
LOG_FILE="${LOG_FILE:-/var/log/jecloud_jvm_monitor.log}"       # monitor log path
ENV_NAME="${ENV_NAME:-jecloud-dev}"                            # environment name used in alert subjects

# Processes to monitor. Each entry is matched against the full command line,
# so it must identify exactly one service.
PROCESS_NAMES=(
    "/data/jecloud/jecloud-meta"
    "/data/jecloud/jecloud-rbac"
    "/data/jecloud/jecloud-workflow"
    "/data/jecloud/jecloud-gateway"
    "/data/jecloud/jecloud-demo"
    "/data/jecloud/jecloud-document"
    "/data/jecloud/jecloud-connector"
    "/data/jecloud/jecloud-message"
    # add more processes here...
)

# Automatic restart
RESTART_ENABLE="${RESTART_ENABLE:-true}"
JEINIT_DIR="${JEINIT_DIR:-/root/jecloud-install-v3.0.8}"       # directory containing the jeinit tool


# E-mail alert settings
EMAIL_ENABLE="${EMAIL_ENABLE:-true}"
SMTP_SERVER="${SMTP_SERVER:-smtp.qiye.aliyun.com}"
SMTP_PORT="${SMTP_PORT:-465}"
EMAIL_FROM="${EMAIL_FROM:-suanbanyun@china-hbp.com}"
EMAIL_TO="${EMAIL_TO:-zhaihx@china-hbp.com}"                   # space-separated recipient list
# Prefer supplying EMAIL_PASS through the environment instead of editing this
# file; an application-specific password is recommended.
EMAIL_PASS="${EMAIL_PASS:-123}"

# ---------------------- Global state ----------------------
declare -A CONSECUTIVE_GC_COUNT=()  # PID -> number of consecutive GC clean-ups

# ---------------------- 函数定义 ----------------------

# Derive the service identifier from a process path.
# Arguments: $1 - process path, e.g. /data/jecloud/jecloud-meta
# Outputs:   the last path component with everything up to and including its
#            first "-" removed (e.g. "meta"); the whole component is printed
#            when it contains no "-".
get_service_id() {
    local path="$1"
    # Drop trailing slashes so "/data/jecloud/jecloud-meta/" still yields "meta"
    while [[ "$path" == */ ]]; do
        path="${path%/}"
    done
    # Keep only the part after the last slash
    local base_name="${path##*/}"
    # printf instead of echo: the result is never mistaken for an echo option
    printf '%s\n' "${base_name#*-}"
}

# List the PIDs of all processes whose full command line matches a pattern.
# Arguments: $1 - pattern handed to pgrep -f
# Outputs:   matching PIDs, one per line
# Returns:   pgrep's status; 1 when the pattern is empty
get_pids() {
    local pname="$1"
    # An empty pattern would match every command line on the host; refuse it
    # so the caller never treats unrelated processes as monitoring targets.
    if [[ -z "$pname" ]]; then
        return 1
    fi
    # "--" keeps a pattern that starts with "-" from being parsed as an option
    pgrep -f -- "$pname"
}

# Send an alert e-mail through curl's SMTPS support.
# Globals:   ENV_NAME, EMAIL_FROM, EMAIL_TO (space-separated recipients),
#            EMAIL_PASS, SMTP_SERVER, SMTP_PORT (all read)
# Arguments: $1 - process name, $2 - PID,
#            $3 - old-generation usage (percent), $4 - metaspace usage (percent)
# Returns:   curl's exit status; 1 when there is no recipient or no temp file
send_alert() {
    local pname="$1"
    local pid="$2"
    local old_gen="$3"
    local meta_space="$4"

    local subject="$ENV_NAME-JVM内存告警: $pname (PID: $pid)"
    local now
    now=$(date "+%Y-%m-%d %H:%M:%S")

    # Split the recipient list once; the array feeds both the To: header and
    # the --mail-rcpt options.
    local -a recipients=()
    read -r -a recipients <<< "$EMAIL_TO"
    if (( ${#recipients[@]} == 0 )); then
        echo "send_alert: EMAIL_TO is empty, nothing to send" >&2
        return 1
    fi
    local to_header
    to_header=$(IFS=,; printf '%s' "${recipients[*]}")

    local mail_file
    mail_file=$(mktemp) || return 1

    # printf with %s keeps backslashes in the process name literal, and the
    # Content-Type header declares the UTF-8 body explicitly.
    # NOTE(review): the subject is still sent as raw UTF-8 (no RFC 2047 encoding).
    {
        printf 'From: %s\n' "$EMAIL_FROM"
        printf 'To: %s\n' "$to_header"
        printf 'Subject: %s\n' "$subject"
        printf 'MIME-Version: 1.0\n'
        printf 'Content-Type: text/plain; charset=UTF-8\n'
        printf 'Content-Transfer-Encoding: 8bit\n'
        printf '\n'
        printf '检测到进程 %s (PID: %s) 内存使用率超限!\n\n' "$pname" "$pid"
        printf '当前状态:\n'
        printf '老年代(O): %s%%\n' "$old_gen"
        printf '元空间(M): %s%%\n' "$meta_space"
        printf ' 可以设置自动重启\n'
        printf ' 时间: %s\n' "$now"
    } > "$mail_file"

    # Build the command as an array instead of an eval'ed string, so quotes or
    # spaces in the password and addresses reach curl unchanged.
    local -a curl_args=(
        -v -s
        --url "smtps://$SMTP_SERVER:$SMTP_PORT"
        --ssl-reqd
        --user "$EMAIL_FROM:$EMAIL_PASS"
        --mail-from "$EMAIL_FROM"
    )
    local to
    for to in "${recipients[@]}"; do
        curl_args+=(--mail-rcpt "$to")
    done
    curl_args+=(--upload-file "$mail_file")

    local rc=0
    curl "${curl_args[@]}" || rc=$?
    rm -f -- "$mail_file"
    return "$rc"
}

# Ask a JVM to run a garbage collection via "jcmd <pid> GC.run".
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - process name, $2 - PID, $3 - timestamp used in log lines,
#            $4 - old-generation usage (percent), $5 - metaspace usage (percent)
# Returns:   0 when jcmd succeeded; 1 when the process is gone or jcmd failed
perform_gc_clean() {
    local pname=$1 pid=$2 timestamp=$3 old_gen=$4 meta_space=$5
    local ret

    # Guard: bail out early when the target process no longer exists
    ps -p "$pid" > /dev/null || {
        echo "[$timestamp][$pname] ERROR: 进程 $pid 不存在" >> "$LOG_FILE"
        return 1
    }

    jcmd "$pid" GC.run >/dev/null 2>&1
    ret=$?

    # Guard: report the jcmd exit status on failure
    if (( ret != 0 )); then
        echo "[$timestamp][$pname] ERROR: 内存清理失败 (PID:$pid, 状态码:$ret)" >> "$LOG_FILE"
        return 1
    fi

    echo "[$timestamp][$pname] SUCCESS: 执行内存清理 (PID:$pid, O:${old_gen}% M:${meta_space}%)" >> "$LOG_FILE"
    return 0
}

# Return 0 when $1 is a non-negative decimal number that is >= $2.
# Non-numeric values (such as a "-" placeholder in a jstat column) never
# count as exceeding a threshold.
_pct_at_least() {
    local value="$1" threshold="$2"
    [[ "$value" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
    [[ "$threshold" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
    # awk is already required by this script; using it avoids depending on bc
    awk -v v="$value" -v t="$threshold" 'BEGIN { exit !((v + 0) >= (t + 0)) }'
}

# Check one JVM's memory usage and react when a threshold is exceeded.
# Globals:   LOG_FILE (appended to), OLD_GEN_THRESHOLD, META_SPACE_THRESHOLD,
#            EMAIL_ENABLE, RESTART_ENABLE, JEINIT_DIR (read),
#            CONSECUTIVE_GC_COUNT (read and written, keyed by PID)
# Arguments: $1 - process name, $2 - PID
# Behaviour: while the consecutive clean-up count is below 3 a GC is requested
#            and the count is incremented on success; on the next breach an
#            alert mail is sent (if enabled), the service is restarted through
#            jeinit (if enabled) and the count is reset. Normal readings reset
#            the count.
monitor_jvm() {
    local pname="$1"
    local pid="$2"
    local gcutil timestamp
    # Only the last line of the jstat output carries the values
    gcutil=$(jstat -gcutil "$pid" 2>/dev/null | tail -1)
    timestamp=$(date "+%Y-%m-%d %H:%M:%S")

    # Validate the jstat output: at least five whitespace-separated columns
    local -a fields=()
    read -r -a fields <<< "$gcutil"
    if (( ${#fields[@]} < 5 )); then
        echo "[$timestamp][$pname] ERROR: 获取JVM状态失败 (PID:$pid)" >> "$LOG_FILE"
        return
    fi

    # Columns 4 and 5 of "jstat -gcutil" are read as old generation and metaspace
    local old_gen="${fields[3]}"
    local meta_space="${fields[4]}"
    local gc_count=${CONSECUTIVE_GC_COUNT[$pid]:-0}

    echo "[$timestamp][$pname] PID=$pid | O=$old_gen% | M=$meta_space% | GC计数=$gc_count" >> "$LOG_FILE"

    # Threshold checks
    local exceeded=0
    if _pct_at_least "$old_gen" "$OLD_GEN_THRESHOLD"; then
        exceeded=1
        echo "[$timestamp][$pname] WARNING: 老年代使用率 ${old_gen}% ≥ ${OLD_GEN_THRESHOLD}%!" >> "$LOG_FILE"
    fi

    if _pct_at_least "$meta_space" "$META_SPACE_THRESHOLD"; then
        exceeded=1
        echo "[$timestamp][$pname] WARNING: 元空间使用率 ${meta_space}% ≥ ${META_SPACE_THRESHOLD}%!" >> "$LOG_FILE"
    fi

    # Memory is fine: reset the counter and stop
    if [ "$exceeded" -ne 1 ]; then
        CONSECUTIVE_GC_COUNT[$pid]=0
        return
    fi

    if [ "$gc_count" -lt 3 ]; then
        if perform_gc_clean "$pname" "$pid" "$timestamp" "$old_gen" "$meta_space"; then
            CONSECUTIVE_GC_COUNT[$pid]=$((gc_count + 1))
        fi
        return
    fi

    echo "[$timestamp][$pname] ALERT: 已连续执行3次清理,发送告警" >> "$LOG_FILE"
    if [ "$EMAIL_ENABLE" = true ]; then
        send_alert "$pname" "$pid" "$old_gen" "$meta_space"
    fi
    if [ "$RESTART_ENABLE" = true ]; then
        local service_id
        service_id=$(get_service_id "$pname")
        echo "./jeinit restart --name $service_id" >> "$LOG_FILE"
        # Run in a subshell: the monitor's own working directory stays
        # untouched, and jeinit is not started at all when the cd fails.
        ( cd "$JEINIT_DIR" && ./jeinit restart --name "$service_id" ) 2>&1 | tee -a "$LOG_FILE"
    fi
    CONSECUTIVE_GC_COUNT[$pid]=0
}

# ---------------------- Main loop ----------------------
# Forever: look up the PIDs of every configured process, check each JVM found,
# log an error for processes that are not running, then sleep CHECK_INTERVAL.
while :; do
    TIMESTAMP="$(date '+%Y-%m-%d %H:%M:%S')"

    for process_name in "${PROCESS_NAMES[@]}"; do
        PIDS="$(get_pids "$process_name")"

        if [ -n "$PIDS" ]; then
            # One process name may map to several PIDs; check each of them
            for pid in $PIDS; do
                monitor_jvm "$process_name" "$pid"
            done
        else
            echo "[$TIMESTAMP][$process_name] ERROR: 未找到进程" >> "$LOG_FILE"
        fi
    done

    sleep "$CHECK_INTERVAL"
done
最后编辑: 翟厚翔  文档更新时间: 2025-06-27 17:19   作者:翟厚翔