jecloud java进程jvm监控脚本
常用参数介绍:
- ENV_NAME 指定环境名称
- PROCESS_NAMES 指定监控的进程(进程要唯一)
- RESTART_ENABLE 指定是否开启重启
- JEINIT_DIR 指定jeinit的目录
- EMAIL_ENABLE 指定是否开启邮件通知
#!/bin/bash
# ---------------------- Configuration ----------------------
# Every scalar setting below keeps its previous value as the default but can
# be overridden from the environment, so secrets such as EMAIL_PASS no longer
# have to be edited into this file, e.g.:
#   EMAIL_PASS='app-password' CHECK_INTERVAL=60 bash <this-script>
CHECK_INTERVAL="${CHECK_INTERVAL:-200}"             # seconds between monitoring rounds
OLD_GEN_THRESHOLD="${OLD_GEN_THRESHOLD:-85}"        # old generation alert threshold (percent)
META_SPACE_THRESHOLD="${META_SPACE_THRESHOLD:-95}"  # metaspace alert threshold (percent)
LOG_FILE="${LOG_FILE:-/var/log/jecloud_jvm_monitor.log}" # monitor log path
ENV_NAME="${ENV_NAME:-jecloud-dev}"                 # environment name, used in the alert subject

# Processes to monitor. Each entry is matched against the full command line
# of running processes, so it must identify exactly one service.
PROCESS_NAMES=(
  "/data/jecloud/jecloud-meta"
  "/data/jecloud/jecloud-rbac"
  "/data/jecloud/jecloud-workflow"
  "/data/jecloud/jecloud-gateway"
  "/data/jecloud/jecloud-demo"
  "/data/jecloud/jecloud-document"
  "/data/jecloud/jecloud-connector"
  "/data/jecloud/jecloud-message"
  # add more processes here...
)

# Automatic restart: "true" enables it, any other value disables it.
RESTART_ENABLE="${RESTART_ENABLE:-true}"
JEINIT_DIR="${JEINIT_DIR:-/root/jecloud-install-v3.0.8}" # directory from which ./jeinit is run

# E-mail alerting: "true" enables it, any other value disables it.
EMAIL_ENABLE="${EMAIL_ENABLE:-true}"
SMTP_SERVER="${SMTP_SERVER:-smtp.qiye.aliyun.com}"
SMTP_PORT="${SMTP_PORT:-465}"
EMAIL_FROM="${EMAIL_FROM:-suanbanyun@china-hbp.com}"
EMAIL_TO="${EMAIL_TO:-zhaihx@china-hbp.com}"        # space-separated list of recipients
# An application-specific password is recommended; prefer supplying it through
# the environment instead of relying on this placeholder default.
EMAIL_PASS="${EMAIL_PASS:-123}"

# ---------------------- Global state ----------------------
# Maps a process PID to the number of consecutive GC clean-ups performed on it.
declare -A CONSECUTIVE_GC_COUNT=()
# ---------------------- 函数定义 ----------------------
# Derive the service identifier from a process path.
# Arguments: $1 - process path, e.g. /data/jecloud/jecloud-meta
# Outputs:   the part of the last path component that follows its first "-"
#            (the whole component when it contains no "-"), e.g. "meta"
get_service_id() {
  # Drop everything up to and including the last "/".
  local service="${1##*/}"
  # Drop the shortest leading run that ends in "-".
  service="${service#*-}"
  echo "$service"
}
# List the PIDs of all processes whose full command line matches a pattern.
# Arguments: $1 - pattern handed to `pgrep -f`
# Outputs:   one PID per line on stdout
# Returns:   1 without running pgrep when the pattern is empty,
#            otherwise pgrep's exit status
get_pids() {
  local pname="$1"
  # An empty regular expression matches every command line, which would make
  # the caller treat every process on the host as a monitored JVM.
  [[ -n "$pname" ]] || return 1
  # "--" keeps a pattern that starts with "-" from being parsed as an option.
  pgrep -f -- "$pname"
}
# Send an alert e-mail about a JVM whose memory usage is over threshold.
# Globals (read): ENV_NAME, EMAIL_FROM, EMAIL_PASS, SMTP_SERVER, SMTP_PORT,
#                 EMAIL_TO (whitespace-separated list of recipients)
# Arguments: $1 - process identifier, $2 - PID,
#            $3 - old generation usage (%), $4 - metaspace usage (%)
# Returns:   curl's exit status; 1 when there is no recipient or the
#            temporary message file cannot be created
send_alert() {
  local pname="$1"
  local pid="$2"
  local old_gen="$3"
  local meta_space="$4"
  local subject="$ENV_NAME-JVM内存告警: $pname (PID: $pid)"
  local -a recipients=() curl_args=()
  local rcpt to_header mail_file now
  local rc=0

  # Split the recipient list into an array (newlines are treated as blanks).
  read -r -a recipients <<< "${EMAIL_TO//$'\n'/ }"
  if (( ${#recipients[@]} == 0 )); then
    printf 'send_alert: EMAIL_TO is empty, no alert sent\n' >&2
    return 1
  fi
  to_header=$(IFS=,; printf '%s' "${recipients[*]}")

  mail_file=$(mktemp) || return 1
  now=$(date "+%Y-%m-%d %H:%M:%S")

  # printf with %s keeps backslashes in the data literal, unlike `echo -e`.
  {
    printf 'From: %s\nTo: %s\nSubject: %s\n\n' "$EMAIL_FROM" "$to_header" "$subject"
    printf '检测到进程 %s (PID: %s) 内存使用率超限!\n\n当前状态:\n' "$pname" "$pid"
    printf '老年代(O): %s%%\n元空间(M): %s%%\n' "$old_gen" "$meta_space"
    printf ' 可以设置自动重启\n 时间: %s\n' "$now"
  } > "$mail_file"

  # Build the command as an array instead of an eval'ed string so that quotes
  # or blanks in the password or addresses cannot break or inject into it.
  # NOTE(review): the password is still passed on curl's command line and is
  # therefore visible in the process list; consider a curl config file.
  curl_args=(
    -v -s
    --url "smtps://$SMTP_SERVER:$SMTP_PORT"
    --ssl-reqd
    --user "$EMAIL_FROM:$EMAIL_PASS"
    --mail-from "$EMAIL_FROM"
  )
  for rcpt in "${recipients[@]}"; do
    curl_args+=(--mail-rcpt "$rcpt")
  done
  curl_args+=(--upload-file "$mail_file")

  curl "${curl_args[@]}" || rc=$?
  rm -f -- "$mail_file"
  return "$rc"
}
# Ask a running JVM to perform a garbage collection via `jcmd <pid> GC.run`.
# Globals (read): LOG_FILE - file the outcome is appended to
# Arguments: $1 - process identifier (used in log lines), $2 - PID,
#            $3 - timestamp for the log line,
#            $4 - old generation usage (%), $5 - metaspace usage (%)
# Returns:   0 when jcmd succeeded; 1 when the PID does not exist or jcmd failed
perform_gc_clean() {
  local pname="$1" pid="$2" timestamp="$3" old_gen="$4" meta_space="$5"
  local tag="[$timestamp][$pname]"
  local rc=0

  # Nothing to do if the process has gone away since it was listed.
  ps -p "$pid" > /dev/null || {
    echo "$tag ERROR: 进程 $pid 不存在" >> "$LOG_FILE"
    return 1
  }

  jcmd "$pid" GC.run > /dev/null 2>&1 || rc=$?
  if (( rc != 0 )); then
    echo "$tag ERROR: 内存清理失败 (PID:$pid, 状态码:$rc)" >> "$LOG_FILE"
    return 1
  fi

  echo "$tag SUCCESS: 执行内存清理 (PID:$pid, O:${old_gen}% M:${meta_space}%)" >> "$LOG_FILE"
  return 0
}
# Succeed when decimal number $1 is greater than or equal to decimal number $2.
_jvm_pct_ge() {
  awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 >= limit + 0) }'
}

# Check one JVM's old generation and metaspace usage and react to overruns.
# While usage is over threshold a GC is requested on each check; once three
# GCs have been performed in a row and usage is still over threshold, an alert
# is sent and/or the service is restarted (depending on configuration).
# Globals (read):  LOG_FILE, OLD_GEN_THRESHOLD, META_SPACE_THRESHOLD,
#                  EMAIL_ENABLE, RESTART_ENABLE, JEINIT_DIR
# Globals (write): CONSECUTIVE_GC_COUNT[pid]
# Arguments: $1 - process identifier (path used to find the process), $2 - PID
monitor_jvm() {
  local pname="$1"
  local pid="$2"
  local gcutil timestamp old_gen meta_space gc_count service_id
  local -a fields=()
  local exceeded=0
  local number_re='^[0-9]+([.][0-9]+)?$'

  # Last line of `jstat -gcutil` holds the values; the 4th and 5th columns
  # are read as old generation (O) and metaspace (M) usage.
  gcutil=$(jstat -gcutil "$pid" 2>/dev/null | tail -1)
  timestamp=$(date "+%Y-%m-%d %H:%M:%S")
  read -r -a fields <<< "$gcutil"

  # Reject missing output and non-numeric columns up front, so the threshold
  # comparison below never runs on garbage.
  if (( ${#fields[@]} < 5 )) \
    || [[ ! "${fields[3]}" =~ $number_re || ! "${fields[4]}" =~ $number_re ]]; then
    echo "[$timestamp][$pname] ERROR: 获取JVM状态失败 (PID:$pid)" >> "$LOG_FILE"
    return
  fi

  old_gen="${fields[3]}"
  meta_space="${fields[4]}"
  gc_count=${CONSECUTIVE_GC_COUNT[$pid]:-0}
  echo "[$timestamp][$pname] PID=$pid | O=$old_gen% | M=$meta_space% | GC计数=$gc_count" >> "$LOG_FILE"

  # Threshold checks (decimal comparison done by awk; no dependency on bc).
  if _jvm_pct_ge "$old_gen" "$OLD_GEN_THRESHOLD"; then
    exceeded=1
    echo "[$timestamp][$pname] WARNING: 老年代使用率 ${old_gen}% ≥ ${OLD_GEN_THRESHOLD}%!" >> "$LOG_FILE"
  fi
  if _jvm_pct_ge "$meta_space" "$META_SPACE_THRESHOLD"; then
    exceeded=1
    echo "[$timestamp][$pname] WARNING: 元空间使用率 ${meta_space}% ≥ ${META_SPACE_THRESHOLD}%!" >> "$LOG_FILE"
  fi

  if (( exceeded == 0 )); then
    # Usage is back to normal: reset the consecutive GC counter.
    CONSECUTIVE_GC_COUNT[$pid]=0
    return
  fi

  if (( gc_count < 3 )); then
    # Only successful clean-ups count towards the limit.
    if perform_gc_clean "$pname" "$pid" "$timestamp" "$old_gen" "$meta_space"; then
      CONSECUTIVE_GC_COUNT[$pid]=$((gc_count + 1))
    fi
    return
  fi

  echo "[$timestamp][$pname] ALERT: 已连续执行3次清理,发送告警" >> "$LOG_FILE"
  if [ "$EMAIL_ENABLE" = true ]; then
    send_alert "$pname" "$pid" "$old_gen" "$meta_space"
  fi
  service_id=$(get_service_id "$pname")
  if [ "$RESTART_ENABLE" = true ]; then
    echo "./jeinit restart --name $service_id" >> "$LOG_FILE"
    # Run in a subshell: the monitor's own working directory stays untouched,
    # and jeinit is not started at all when the directory cannot be entered.
    ( cd "$JEINIT_DIR" && ./jeinit restart --name "$service_id" ) 2>&1 | tee -a "$LOG_FILE"
  fi
  CONSECUTIVE_GC_COUNT[$pid]=0
}
# ---------------------- Main program ----------------------
# Endless monitoring loop: every CHECK_INTERVAL seconds, look up the PIDs of
# each configured process and check every one of them; a process with no
# matching PID is reported in the log.
while :; do
  TIMESTAMP=$(date "+%Y-%m-%d %H:%M:%S")

  for process_name in "${PROCESS_NAMES[@]}"; do
    PIDS=$(get_pids "$process_name")

    if [[ -n "$PIDS" ]]; then
      # Deliberately unquoted: one word per PID.
      for pid in $PIDS; do
        monitor_jvm "$process_name" "$pid"
      done
    else
      echo "[$TIMESTAMP][$process_name] ERROR: 未找到进程" >> "$LOG_FILE"
    fi
  done

  sleep "$CHECK_INTERVAL"
done
最后编辑: 翟厚翔 文档更新时间: 2025-06-27 17:19 作者:翟厚翔