7 changed files with 283 additions and 7 deletions
@ -0,0 +1,118 @@
|
||||
# Prometheus alerting rules for host-level metrics (node_* series and `up`).
# Each rule sets a `level` label (warning / serious) and `summary` /
# `description` annotations. Wherever a description reports a percentage, the
# expression is scaled so that {{ $value }} is a percentage as well.
groups:
  - name: alert_rules
    rules:
      # Non-idle CPU share per instance, scaled to percent so that $value
      # matches the "60%" wording in the description.
      - alert: CpuUsageAlertWarning
        expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) * 100 > 60
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 60% (current value: {{ $value }})"

      - alert: CpuUsageAlertSerious
        # Earlier ratio-based form, kept for reference:
        # expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.85
        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job=~".*",mode="idle"}[5m])) * 100)) > 85
        for: 3m
        labels:
          level: serious
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"

      # Memory usage computed from MemFree + Buffers + Cached, in percent.
      - alert: MemUsageAlertWarning
        expr: avg by(instance) ((1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100) > 70
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{$labels.instance}}: MEM usage is above 70% (current value is: {{ $value }})"

      # Memory usage computed from MemAvailable, scaled to percent so that
      # $value matches the "90%" wording in the description.
      - alert: MemUsageAlertSerious
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 3m
        labels:
          level: serious
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})"

      # Filesystem usage in percent; rootfs and mount points under
      # /run, /var, /sys and /dev are excluded by the selector.
      - alert: DiskUsageAlertWarning
        expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 80
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Disk usage high"
          description: "{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})"

      - alert: DiskUsageAlertSerious
        expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 90
        for: 3m
        labels:
          level: serious
        annotations:
          summary: "Instance {{ $labels.instance }} Disk usage high"
          description: "{{$labels.instance}}: Disk usage is above 90% (current value is: {{ $value }})"

      # Allocated file descriptors as a percentage of the maximum.
      - alert: NodeFileDescriptorUsage
        expr: avg by (instance) (node_filefd_allocated{} / node_filefd_maximum{}) * 100 > 60
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} File Descriptor usage high"
          description: "{{$labels.instance}}: File Descriptor usage is above 60% (current value is: {{ $value }})"

      - alert: NodeLoad15
        expr: avg by (instance) (node_load15{}) > 80
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Load15 usage high"
          description: "{{$labels.instance}}: Load15 is above 80 (current value is: {{ $value }})"

      # Fires when the `up` series of an instance averages to 0.
      - alert: NodeAgentStatus
        expr: avg by (instance) (up{}) == 0
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "{{$labels.instance}}: has been down"
          description: "{{$labels.instance}}: Node_Exporter Agent is down (current value is: {{ $value }})"

      - alert: NodeProcsBlocked
        expr: avg by (instance) (node_procs_blocked{}) > 10
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Process Blocked usage high"
          description: "{{$labels.instance}}: Node Blocked Procs detected! above 10 (current value is: {{ $value }})"

      # Bytes/s -> / 1024 / 1024 * 8 gives megabits per second, hence "Mbps".
      - alert: NetworkTransmitRate
        # Earlier single-device variant, kept for reference:
        # expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
        expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
        for: 1m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Network Transmit Rate usage high"
          description: "{{$labels.instance}}: Node Transmit Rate (Upload) is above 40Mbps (current value is: {{ $value }}Mbps)"

      - alert: NetworkReceiveRate
        # Earlier single-device variant, kept for reference:
        # expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
        expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
        for: 1m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Network Receive Rate usage high"
          description: "{{$labels.instance}}: Node Receive Rate (Download) is above 40Mbps (current value is: {{ $value }}Mbps)"

      # Read rate is evaluated in KB/s (bytes / 1024).
      - alert: DiskReadRate
        expr: avg by (instance) (floor(irate(node_disk_read_bytes_total{}[2m]) / 1024 )) > 200
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Disk Read Rate usage high"
          description: "{{$labels.instance}}: Node Disk Read Rate is above 200KB/s (current value is: {{ $value }}KB/s)"

      # Write rate is evaluated in MB/s (bytes / 1024 / 1024).
      - alert: DiskWriteRate
        expr: avg by (instance) (floor(irate(node_disk_written_bytes_total{}[2m]) / 1024 / 1024 )) > 20
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Disk Write Rate usage high"
          description: "{{$labels.instance}}: Node Disk Write Rate is above 20MB/s (current value is: {{ $value }}MB/s)"
@ -0,0 +1,56 @@
|
||||
# Alertmanager configuration: global SMTP settings, the notification template
# file, the routing tree, and three receivers (WeChat Work, DingTalk webhook,
# e-mail). Values such as 'xxxxxxxx' are placeholders to be filled in.
global:
  # Time after which an alert is declared resolved when it is no longer reported.
  resolve_timeout: 5m
  # SMTP server used to send e-mail notifications.
  smtp_smarthost: 'smtp.163.com:25'
  # Sender address.
  smtp_from: 'bladejava@163.com'
  # SMTP login (the mailbox address).
  smtp_auth_username: 'bladejava@163.com'
  # Mailbox authorization code; it has to be enabled in the mailbox settings
  # and is not the mailbox password.
  smtp_auth_password: 'xxxxxxxx'
  # Hostname announced to the SMTP server; this must be a host name, not an
  # e-mail address.
  smtp_hello: '163.com'
  smtp_require_tls: false

templates:
  # Notification template file.
  - '/etc/alertmanager/templates/wechat.tmpl'

route:
  # Labels by which incoming alerts are grouped.
  group_by: ['alertname']
  # How long to wait after a group is created before its first notification.
  group_wait: 10s
  # How long to wait before sending a notification for a group.
  group_interval: 30s
  # Interval between repeated notifications.
  repeat_interval: 5m
  # Default receiver.
  receiver: 'wechat'

receivers:
  # WeChat
  - name: 'wechat'
    wechat_configs:
      # Whether to send a notification when alerts are resolved.
      - send_resolved: true
        # Application AgentId.
        agent_id: '1000002'
        # Application Secret.
        api_secret: 'jxxxxxxxxxxxxxxxxxxxc'
        # Enterprise (corp) ID.
        corp_id: 'wwxxxxxxxxxxx01d'
        # Recipients of the message.
        to_user: '@all'

  # DingTalk
  - name: 'dingtalk'
    webhook_configs:
      # Address of the prometheus-webhook-dingtalk service.
      - url: 'http://172.30.0.96:8060/dingtalk/webhook_robot/send'
        send_resolved: true

  # E-mail
  - name: 'email'
    email_configs:
      - to: 'your email'
        send_resolved: true
@ -0,0 +1,12 @@
|
||||
# DingTalk webhook relay configuration: a request timeout and two named
# targets, one of which mentions everyone in the chat.
timeout: 5s

targets:
  webhook_robot:
    # Webhook URL issued when the DingTalk robot is created.
    url: 'https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx'

  webhook_mention_all:
    # Webhook URL issued when the DingTalk robot is created.
    url: 'https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx'
    # Mention all members.
    mention:
      all: true
@ -0,0 +1,34 @@
|
||||
{{/*
  wechat.default.message renders the WeChat notification body.

  It emits two optional sections: one for the firing alerts of the group and
  one for the resolved alerts. Each section prints a header (alert name,
  status, level) taken from its first alert, followed by one detail entry per
  alert (summary, description, start time, end time for resolved alerts, and
  the instance label when it is non-empty).

  Each section iterates only over its own subset (.Alerts.Firing or
  .Alerts.Resolved), so a group containing both firing and resolved alerts
  does not list firing alerts under the recovery section or vice versa.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 -}}
==========告警通知==========
告警类型: {{ $alert.Labels.alertname }}
告警状态: {{ $alert.Status }}
告警级别: {{ $alert.Labels.level }}
{{- end }}
==========告警详情==========
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 -}}
==========恢复通知==========
告警类型: {{ $alert.Labels.alertname }}
告警状态: {{ $alert.Status }}
告警级别: {{ $alert.Labels.level }}
{{- end }}
==========恢复详情==========
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.description }}
故障时间: {{ $alert.StartsAt.Local }}
恢复时间: {{ $alert.EndsAt.Local }}
{{ if gt (len $alert.Labels.instance) 0 -}}故障实例: {{ $alert.Labels.instance }}{{- end -}}
{{- end }}
{{- end }}
{{- end }}
Loading…
Reference in new issue