diff --git a/script/docker/app/deploy.sh b/script/docker/app/deploy.sh
index 803e3e14..5f3a1d6e 100644
--- a/script/docker/app/deploy.sh
+++ b/script/docker/app/deploy.sh
@@ -1,7 +1,7 @@
 #使用说明,用来提示输入参数
 usage() {
-  echo "Usage: sh 执行脚本.sh [port|mount|base|monitor|modules|prometheus|stop|rm|rmiNoneTag]"
+  echo "Usage: sh 执行脚本.sh [port|mount|base|monitor|modules|prometheus|alertmanager|stop|rm|rmiNoneTag]"
   exit 1
 }
 
@@ -57,13 +57,30 @@ mount(){
     mkdir -p /docker/prometheus
     cp prometheus/config/prometheus.yml /docker/prometheus/prometheus.yml
   fi
+  if test ! -f "/docker/prometheus/rules/alert_rules.yml" ;then
+    mkdir -p /docker/prometheus/rules
+    cp prometheus/config/alert_rules.yml /docker/prometheus/rules/alert_rules.yml
+  fi
   if test ! -f "/docker/grafana/grafana.ini" ;then
     mkdir -p /docker/grafana
     cp prometheus/config/grafana.ini /docker/grafana/grafana.ini
   fi
+  if test ! -f "/docker/alertmanager/alertmanager.yml" ;then
+    mkdir -p /docker/alertmanager
+    cp prometheus/config/alertmanager.yml /docker/alertmanager/alertmanager.yml
+  fi
+  if test ! -f "/docker/alertmanager/templates/wechat.tmpl" ;then
+    mkdir -p /docker/alertmanager/templates
+    cp prometheus/config/wechat.tmpl /docker/alertmanager/templates/wechat.tmpl
+  fi
+  if test ! -f "/docker/webhook_dingtalk/dingtalk.yml" ;then
+    mkdir -p /docker/webhook_dingtalk
+    cp prometheus/config/dingtalk.yml /docker/webhook_dingtalk/dingtalk.yml
+  fi
   #增加目录权限
   chmod -R 777 /docker/prometheus
   chmod -R 777 /docker/grafana
+  chmod -R 777 /docker/alertmanager
 }
 
 #启动基础模块
@@ -86,6 +103,11 @@ prometheus(){
   docker-compose up -d prometheus node-exporter mysqld-exporter cadvisor grafana
 }
 
+#Start the alerting modules
+alertmanager(){
+  docker-compose up -d alertmanager webhook-dingtalk
+}
+
 #关闭所有模块
 stop(){
   docker-compose stop
@@ -121,6 +143,9 @@ case "$1" in
 "prometheus")
   prometheus
 ;;
+"alertmanager")
+  alertmanager
+;;
 "stop")
   stop
 ;;
diff --git a/script/docker/app/docker-compose.yml b/script/docker/app/docker-compose.yml
index 6e050c4b..92b67f3d 100644
--- a/script/docker/app/docker-compose.yml
+++ b/script/docker/app/docker-compose.yml
@@ -255,6 +255,7 @@ services:
       - 9090:9090
     volumes:
       - /docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - /docker/prometheus/rules:/etc/prometheus/rules
     command: "--config.file=/etc/prometheus/prometheus.yml --web.enable-lifecycle"
     privileged: true
     restart: always
@@ -333,6 +334,38 @@
       blade_net:
         ipv4_address: 172.30.0.30
 
+  alertmanager:
+    image: prom/alertmanager:v0.21.0
+    hostname: "alertmanager"
+    environment:
+      - TZ=Asia/Shanghai
+    ports:
+      - 9093:9093
+    volumes:
+      - /docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
+      - /docker/alertmanager/data:/etc/alertmanager/data
+      - /docker/alertmanager/templates:/etc/alertmanager/templates
+    command: "--config.file=/etc/alertmanager/alertmanager.yml --storage.path=/etc/alertmanager/data"
+    privileged: true
+    restart: always
+    networks:
+      blade_net:
+        ipv4_address: 172.30.0.99
+
+  webhook-dingtalk:
+    image: timonwong/prometheus-webhook-dingtalk:v1.4.0
+    hostname: "webhook-dingtalk"
+    environment:
+      - TZ=Asia/Shanghai
+    ports:
+      - 8060:8060
+    command: "--ding.profile=webhook_robot=https://oapi.dingtalk.com/robot/send?access_token=xxxxx"
+    privileged: true
+    restart: always
+    networks:
+      blade_net:
+        ipv4_address: 172.30.0.96
+
 networks:
   blade_net:
     driver: bridge
diff --git a/script/docker/app/prometheus/config/alert_rules.yml b/script/docker/app/prometheus/config/alert_rules.yml
new file mode 100644
index 00000000..db23fdd6
--- /dev/null
+++ b/script/docker/app/prometheus/config/alert_rules.yml
@@ -0,0 +1,118 @@
+groups:
+  - name: alert_rules
+    rules:
+      - alert: CpuUsageAlertWarning
+        expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.60
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} CPU usage high"
+          description: "{{ $labels.instance }} CPU usage above 60% (current value: {{ $value }})"
+      - alert: CpuUsageAlertSerious
+        #expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.85
+        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job=~".*",mode="idle"}[5m])) * 100)) > 85
+        for: 3m
+        labels:
+          level: serious
+        annotations:
+          summary: "Instance {{ $labels.instance }} CPU usage high"
+          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
+      - alert: MemUsageAlertWarning
+        expr: avg by(instance) ((1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100) > 70
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} MEM usage high"
+          description: "{{$labels.instance}}: MEM usage is above 70% (current value is: {{ $value }})"
+      - alert: MemUsageAlertSerious
+        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.90
+        for: 3m
+        labels:
+          level: serious
+        annotations:
+          summary: "Instance {{ $labels.instance }} MEM usage high"
+          description: "{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})"
+      - alert: DiskUsageAlertWarning
+        expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 80
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Disk usage high"
+          description: "{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})"
+      - alert: DiskUsageAlertSerious
+        expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 90
+        for: 3m
+        labels:
+          level: serious
+        annotations:
+          summary: "Instance {{ $labels.instance }} Disk usage high"
+          description: "{{$labels.instance}}: Disk usage is above 90% (current value is: {{ $value }})"
+      - alert: NodeFileDescriptorUsage
+        expr: avg by (instance) (node_filefd_allocated{} / node_filefd_maximum{}) * 100 > 60
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} File Descriptor usage high"
+          description: "{{$labels.instance}}: File Descriptor usage is above 60% (current value is: {{ $value }})"
+      - alert: NodeLoad15
+        expr: avg by (instance) (node_load15{}) > 80
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Load15 usage high"
+          description: "{{$labels.instance}}: Load15 is above 80 (current value is: {{ $value }})"
+      - alert: NodeAgentStatus
+        expr: avg by (instance) (up{}) == 0
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "{{$labels.instance}}: has been down"
+          description: "{{$labels.instance}}: Node_Exporter Agent is down (current value is: {{ $value }})"
+      - alert: NodeProcsBlocked
+        expr: avg by (instance) (node_procs_blocked{}) > 10
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Process Blocked usage high"
+          description: "{{$labels.instance}}: Node Blocked Procs detected! above 10 (current value is: {{ $value }})"
+      - alert: NetworkTransmitRate
+        #expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
+        expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
+        for: 1m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Network Transmit Rate usage high"
+          description: "{{$labels.instance}}: Node Transmit Rate (Upload) is above 40Mbps (current value is: {{ $value }}Mbps)"
+      - alert: NetworkReceiveRate
+        #expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
+        expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
+        for: 1m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Network Receive Rate usage high"
+          description: "{{$labels.instance}}: Node Receive Rate (Download) is above 40Mbps (current value is: {{ $value }}Mbps)"
+      - alert: DiskReadRate
+        expr: avg by (instance) (floor(irate(node_disk_read_bytes_total{}[2m]) / 1024 )) > 200
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Disk Read Rate usage high"
+          description: "{{$labels.instance}}: Node Disk Read Rate is above 200KB/s (current value is: {{ $value }}KB/s)"
+      - alert: DiskWriteRate
+        expr: avg by (instance) (floor(irate(node_disk_written_bytes_total{}[2m]) / 1024 / 1024 )) > 20
+        for: 2m
+        labels:
+          level: warning
+        annotations:
+          summary: "Instance {{ $labels.instance }} Disk Write Rate usage high"
+          description: "{{$labels.instance}}: Node Disk Write Rate is above 20MB/s (current value is: {{ $value }}MB/s)"
diff --git a/script/docker/app/prometheus/config/alertmanager.yml b/script/docker/app/prometheus/config/alertmanager.yml
new file mode 100644
index 00000000..09cfe389
--- /dev/null
+++ b/script/docker/app/prometheus/config/alertmanager.yml
@@ -0,0 +1,56 @@
+global:
+  # How long to wait before declaring an alert resolved when it is no longer firing
+  resolve_timeout: 5m
+  # Email (SMTP) sending configuration
+  smtp_smarthost: 'smtp.163.com:25'
+  # Sender address
+  smtp_from: 'bladejava@163.com'
+  # SMTP auth username (mailbox address)
+  smtp_auth_username: 'bladejava@163.com'
+  # Mailbox authorization code (must be enabled in the mailbox settings; not the mailbox password)
+  smtp_auth_password: 'xxxxxxxx'
+  # Mailbox address used for the SMTP HELO
+  smtp_hello: 'bladejava@163.com'
+  smtp_require_tls: false
+
+templates:
+  # Alert notification template files
+  - "/etc/alertmanager/templates/wechat.tmpl"
+
+route:
+  # Group incoming alerts by these labels
+  group_by: ["alertname"]
+  # How long to wait after a group is first created before sending the initial notification
+  group_wait: 10s
+  # How long to wait before sending notifications about new alerts added to a group
+  group_interval: 30s
+  # Interval between repeated notifications for the same alert
+  repeat_interval: 5m
+  # Default receiver
+  receiver: "wechat"
+
+receivers:
+  # WeChat Work
+  - name: "wechat"
+    wechat_configs:
+      # Whether to send a notification once the alert is resolved
+      - send_resolved: true
+        # Application AgentId
+        agent_id: "1000002"
+        # Application Secret
+        api_secret: "jxxxxxxxxxxxxxxxxxxxc"
+        # Enterprise (corp) ID
+        corp_id: "wwxxxxxxxxxxx01d"
+        # Recipients of the message
+        to_user: "@all"
+  # DingTalk
+  - name: 'dingtalk'
+    webhook_configs:
+      # Address of the prometheus-webhook-dingtalk service
+      - url: http://172.30.0.96:8060/dingtalk/webhook_robot/send
+        send_resolved: true
+  # Email
+  - name: 'email'
+    email_configs:
+      - to: 'your email'
+        send_resolved: true
diff --git a/script/docker/app/prometheus/config/dingtalk.yml b/script/docker/app/prometheus/config/dingtalk.yml
new file mode 100644
index 00000000..9fd668b9
--- /dev/null
+++ b/script/docker/app/prometheus/config/dingtalk.yml
@@ -0,0 +1,12 @@
+timeout: 5s
+
+targets:
+  webhook_robot:
+    # Webhook URL of the created DingTalk robot
+    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
+  webhook_mention_all:
+    # Webhook URL of the created DingTalk robot
+    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
+    # Mention everyone (@all)
+    mention:
+      all: true
diff --git a/script/docker/app/prometheus/config/prometheus.yml b/script/docker/app/prometheus/config/prometheus.yml
index 12253f43..b272d0a0 100644
--- a/script/docker/app/prometheus/config/prometheus.yml
+++ b/script/docker/app/prometheus/config/prometheus.yml
@@ -8,13 +8,11 @@ global:
 alerting:
   alertmanagers:
     - static_configs:
-        - targets:
-          # - alertmanager:9093
+        - targets: ['172.30.0.99:9093']
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
-  # - "first_rules.yml"
-  # - "second_rules.yml"
+  - "rules/alert_rules.yml"
 
 # A scrape configuration containing exactly one endpoint to scrape:
 # Here it's Prometheus itself.
@@ -28,13 +26,13 @@ scrape_configs:
     static_configs:
       - targets: ['172.30.0.93:9100'] #填写node-exporter的docker服务ip:端口或者宿主机ip:映射的端口
         labels:
-          instance: localhost #实例名称或ip
+          instance: localhost:linux #实例名称或ip
   # 监控 mysql
   - job_name: 'mysql'
     static_configs:
       - targets: ['172.30.0.94:9104'] #填写mysqld-exporter的docker服务ip:端口或者宿主机ip:映射的端口
         labels:
-          instance: localhost #实例名称或ip
+          instance: localhost:mysql #实例名称或ip
   # 监控 cadvisor
   - job_name: "docker"
     static_configs:
diff --git a/script/docker/app/prometheus/config/wechat.tmpl b/script/docker/app/prometheus/config/wechat.tmpl
new file mode 100644
index 00000000..0af660fd
--- /dev/null
+++ b/script/docker/app/prometheus/config/wechat.tmpl
@@ -0,0 +1,34 @@
+{{ define "wechat.default.message" }}
+{{- if gt (len .Alerts.Firing) 0 -}}
+{{- range $index, $alert := .Alerts -}}
+{{- if eq $index 0 -}}
+==========告警通知==========
+告警类型: {{ $alert.Labels.alertname }}
+告警状态: {{ $alert.Status }}
+告警级别: {{ $alert.Labels.level }}
+{{- end }}
+==========告警详情==========
+告警主题: {{ $alert.Annotations.summary }}
+告警详情: {{ $alert.Annotations.description }}
+故障时间: {{ $alert.StartsAt.Local }}
+{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
+{{- end }}
+{{- end }}
+
+{{- if gt (len .Alerts.Resolved) 0 -}}
+{{- range $index, $alert := .Alerts -}}
+{{- if eq $index 0 -}}
+==========恢复通知==========
+告警类型: {{ $alert.Labels.alertname }}
+告警状态: {{ $alert.Status }}
+告警级别: {{ $alert.Labels.level }}
+{{- end }}
+==========恢复详情==========
+告警主题: {{ $alert.Annotations.summary }}
+告警详情: {{ $alert.Annotations.description }}
+故障时间: {{ $alert.StartsAt.Local }}
+恢复时间: {{ $alert.EndsAt.Local }}
+{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
+{{- end }}
+{{- end }}
+{{- end }}
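
Notes (not part of the patch): the access_token, SMTP, and WeChat Work values above are placeholders and must be replaced before deploying; only the "wechat" receiver is referenced by the Alertmanager route, so the dingtalk and email receivers stay unused until one of them is set as the route's receiver or added via sub-routes.

A minimal smoke-test sketch, run from script/docker/app on the Docker host. It assumes the port mappings shown above; promtool ships inside the prom/prometheus image, and the reload endpoint works because the Prometheus service is started with --web.enable-lifecycle.

    # Provision the config directories, then start the monitoring and alerting services
    sh deploy.sh mount
    sh deploy.sh prometheus
    sh deploy.sh alertmanager

    # Validate the new rule file inside the running Prometheus container
    docker-compose exec prometheus promtool check rules /etc/prometheus/rules/alert_rules.yml

    # Hot-reload Prometheus and check that Alertmanager answers on its mapped port
    curl -X POST http://localhost:9090/-/reload
    curl http://localhost:9093/-/ready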