From 701f8c1ac49797ff21c9c1d163b8818df382c111 Mon Sep 17 00:00:00 2001 From: smallchill Date: Sun, 28 Feb 2021 18:25:41 +0800 Subject: [PATCH] :tada: 2.8.0.RELEASE --- .../launcher/DemoLauncherServiceImpl.java | 4 +- .../app/prometheus/config/alert_rules.yml | 118 + .../app/prometheus/config/alertmanager.yml | 56 + .../docker/app/prometheus/config/dingtalk.yml | 12 + .../docker/app/prometheus/config/grafana.ini | 849 +++ .../app/prometheus/config/prometheus.yml | 68 + .../docker/app/prometheus/config/wechat.tmpl | 34 + .../prometheus/dashboard/bladex-docker.json | 1705 ++++++ .../app/prometheus/dashboard/bladex-jvm.json | 3825 +++++++++++++ .../prometheus/dashboard/bladex-linux.json | 3841 +++++++++++++ .../prometheus/dashboard/bladex-mysql.json | 5055 +++++++++++++++++ .../prometheus/dashboard/bladex-nacos.json | 4539 +++++++++++++++ 12 files changed, 20104 insertions(+), 2 deletions(-) create mode 100644 script/docker/app/prometheus/config/alert_rules.yml create mode 100644 script/docker/app/prometheus/config/alertmanager.yml create mode 100644 script/docker/app/prometheus/config/dingtalk.yml create mode 100644 script/docker/app/prometheus/config/grafana.ini create mode 100644 script/docker/app/prometheus/config/prometheus.yml create mode 100644 script/docker/app/prometheus/config/wechat.tmpl create mode 100644 script/docker/app/prometheus/dashboard/bladex-docker.json create mode 100644 script/docker/app/prometheus/dashboard/bladex-jvm.json create mode 100644 script/docker/app/prometheus/dashboard/bladex-linux.json create mode 100644 script/docker/app/prometheus/dashboard/bladex-mysql.json create mode 100644 script/docker/app/prometheus/dashboard/bladex-nacos.json diff --git a/blade-service/blade-demo/src/main/java/com/example/demo/launcher/DemoLauncherServiceImpl.java b/blade-service/blade-demo/src/main/java/com/example/demo/launcher/DemoLauncherServiceImpl.java index beabbb90a..1bac76c0b 100644 --- 
a/blade-service/blade-demo/src/main/java/com/example/demo/launcher/DemoLauncherServiceImpl.java +++ b/blade-service/blade-demo/src/main/java/com/example/demo/launcher/DemoLauncherServiceImpl.java @@ -44,8 +44,8 @@ public class DemoLauncherServiceImpl implements LauncherService { // PropsUtil.setProperty(props, "spring.cloud.nacos.config.namespace", LauncherConstant.NACOS_NAMESPACE); // PropsUtil.setProperty(props, "spring.cloud.nacos.discovery.namespace", LauncherConstant.NACOS_NAMESPACE); // 自定义分组 - // PropsUtil.setProperty(props, "spring.cloud.nacos.config.group", NacosConstant.NACOS_CONFIG_GROUP); - // PropsUtil.setProperty(props, "spring.cloud.nacos.discovery.group", NacosConstant.NACOS_CONFIG_GROUP); + // PropsUtil.setProperty(props, "spring.cloud.nacos.config.group", NacosConstant.NACOS_CONFIG_GROUP); + // PropsUtil.setProperty(props, "spring.cloud.nacos.discovery.group", NacosConstant.NACOS_CONFIG_GROUP); } @Override diff --git a/script/docker/app/prometheus/config/alert_rules.yml b/script/docker/app/prometheus/config/alert_rules.yml new file mode 100644 index 000000000..db23fdd6d --- /dev/null +++ b/script/docker/app/prometheus/config/alert_rules.yml @@ -0,0 +1,118 @@ +groups: + - name: alert_rules + rules: + - alert: CpuUsageAlertWarning + expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.60 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} CPU usage high" + description: "{{ $labels.instance }} CPU usage above 60% (current value: {{ $value }})" + - alert: CpuUsageAlertSerious + #expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.85 + expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job=~".*",mode="idle"}[5m])) * 100)) > 85 + for: 3m + labels: + level: serious + annotations: + summary: "Instance {{ $labels.instance }} CPU usage high" + description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ 
$value }})" + - alert: MemUsageAlertWarning + expr: avg by(instance) ((1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100) > 70 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} MEM usage high" + description: "{{$labels.instance}}: MEM usage is above 70% (current value is: {{ $value }})" + - alert: MemUsageAlertSerious + expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.90 + for: 3m + labels: + level: serious + annotations: + summary: "Instance {{ $labels.instance }} MEM usage high" + description: "{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})" + - alert: DiskUsageAlertWarning + expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 80 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} Disk usage high" + description: "{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})" + - alert: DiskUsageAlertSerious + expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 90 + for: 3m + labels: + level: serious + annotations: + summary: "Instance {{ $labels.instance }} Disk usage high" + description: "{{$labels.instance}}: Disk usage is above 90% (current value is: {{ $value }})" + - alert: NodeFileDescriptorUsage + expr: avg by (instance) (node_filefd_allocated{} / node_filefd_maximum{}) * 100 > 60 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} File Descriptor usage high" + description: "{{$labels.instance}}: File Descriptor usage is above 60% (current value is: {{ $value }})" + - alert: NodeLoad15 + expr: avg by (instance) (node_load15{}) > 80 + for: 2m + labels: + level: warning + 
annotations: + summary: "Instance {{ $labels.instance }} Load15 usage high" + description: "{{$labels.instance}}: Load15 is above 80 (current value is: {{ $value }})" + - alert: NodeAgentStatus + expr: avg by (instance) (up{}) == 0 + for: 2m + labels: + level: warning + annotations: + summary: "{{$labels.instance}}: has been down" + description: "{{$labels.instance}}: Node_Exporter Agent is down (current value is: {{ $value }})" + - alert: NodeProcsBlocked + expr: avg by (instance) (node_procs_blocked{}) > 10 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} Process Blocked usage high" + description: "{{$labels.instance}}: Node Blocked Procs detected! above 10 (current value is: {{ $value }})" + - alert: NetworkTransmitRate + #expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50 + expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40 + for: 1m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} Network Transmit Rate usage high" + description: "{{$labels.instance}}: Node Transmit Rate (Upload) is above 40Mbps/s (current value is: {{ $value }}Mbps/s)" + - alert: NetworkReceiveRate + #expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50 + expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40 + for: 1m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} Network Receive Rate usage high" + description: "{{$labels.instance}}: Node Receive Rate (Download) is above 40Mbps/s (current value is: {{ $value }}Mbps/s)" + - alert: DiskReadRate + expr: avg by (instance) (floor(irate(node_disk_read_bytes_total{}[2m]) / 1024 )) > 200 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} Disk Read Rate usage high" + 
description: "{{$labels.instance}}: Node Disk Read Rate is above 200KB/s (current value is: {{ $value }}KB/s)" + - alert: DiskWriteRate + expr: avg by (instance) (floor(irate(node_disk_written_bytes_total{}[2m]) / 1024 / 1024 )) > 20 + for: 2m + labels: + level: warning + annotations: + summary: "Instance {{ $labels.instance }} Disk Write Rate usage high" + description: "{{$labels.instance}}: Node Disk Write Rate is above 20MB/s (current value is: {{ $value }}MB/s)" diff --git a/script/docker/app/prometheus/config/alertmanager.yml b/script/docker/app/prometheus/config/alertmanager.yml new file mode 100644 index 000000000..09cfe389f --- /dev/null +++ b/script/docker/app/prometheus/config/alertmanager.yml @@ -0,0 +1,56 @@ +global: + # 在没有报警的情况下声明为已解决的时间 + resolve_timeout: 5m + # 配置邮件发送信息 + smtp_smarthost: 'smtp.163.com:25' + # 邮箱地址 + smtp_from: 'bladejava@163.com' + # 邮箱地址 + smtp_auth_username: 'bladejava@163.com' + # 邮箱授权码,需要自行开启设置,非邮箱密码 + smtp_auth_password: 'xxxxxxxx' + # 邮箱地址 + smtp_hello: 'bladejava@163.com' + smtp_require_tls: false + +templates: + # 告警模板文件 + - "/etc/alertmanager/templates/wechat.tmpl" + +route: + # 接收到告警后到自定义分组 + group_by: ["alertname"] + # 分组创建后初始化等待时长 + group_wait: 10s + # 告警信息发送之前的等待时长 + group_interval: 30s + # 重复报警的间隔时长 + repeat_interval: 5m + # 默认消息接收 + receiver: "wechat" + +receivers: + # 微信 + - name: "wechat" + wechat_configs: + # 是否发送恢复信息 + - send_resolved: true + # 填写应用 AgentId + agent_id: "1000002" + # 填写应用 Secret + api_secret: "jxxxxxxxxxxxxxxxxxxxc" + # 填写企业 ID + corp_id: "wwxxxxxxxxxxx01d" + # 填写接收消息的群体 + to_user: "@all" + # 钉钉 + - name: 'dingtalk' + webhook_configs: + # prometheus-webhook-dingtalk服务的地址 + - url: http://172.30.0.96:8060/dingtalk/webhook_robot/send + send_resolved: true + # 邮件 + - name: 'email' + email_configs: + - to: 'your email' + send_resolved: true diff --git a/script/docker/app/prometheus/config/dingtalk.yml b/script/docker/app/prometheus/config/dingtalk.yml new file mode 100644 index 000000000..9fd668b94 --- 
/dev/null +++ b/script/docker/app/prometheus/config/dingtalk.yml @@ -0,0 +1,12 @@ +timeout: 5s + +targets: + webhook_robot: + # 钉钉机器人创建后的webhook地址 + url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx + webhook_mention_all: + # 钉钉机器人创建后的webhook地址 + url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx + # 提醒全员 + mention: + all: true diff --git a/script/docker/app/prometheus/config/grafana.ini b/script/docker/app/prometheus/config/grafana.ini new file mode 100644 index 000000000..b1e5ac0a7 --- /dev/null +++ b/script/docker/app/prometheus/config/grafana.ini @@ -0,0 +1,849 @@ +##################### Grafana Configuration Example ##################### +# +# Everything has defaults so you only need to uncomment things you want to +# change + +# possible values : production, development +;app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +;instance_name = ${HOSTNAME} + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +;data = /var/lib/grafana + +# Temporary files in `data` directory older than given duration will be removed +;temp_data_lifetime = 24h + +# Directory where grafana can store logs +;logs = /var/log/grafana + +# Directory where grafana will automatically scan and look for plugins +;plugins = /var/lib/grafana/plugins + +# folder that contains provisioning config files that grafana will apply on startup and while running. 
+;provisioning = conf/provisioning + +#################################### Server #################################### +[server] +# Protocol (http, https, h2, socket) +;protocol = http + +# The ip address to bind to, empty will bind to all interfaces +;http_addr = + +# The http port to use +;http_port = 3000 + +# The public facing domain name used to access grafana from a browser +;domain = localhost + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url you use in browser, used for redirects and emails +# If you use reverse proxy and sub path specify full url (with sub path) +;root_url = %(protocol)s://%(domain)s:%(http_port)s/ + +# Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons. +;serve_from_sub_path = false + +# Log web requests +;router_logging = false + +# the path relative to the working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +;cert_file = +;cert_key = + +# Unix socket path +;socket = + +#################################### Database #################################### +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as separate properties or as one string using the url property. + +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +# If the password contains # or ; you have to wrap it with triple quotes. 
Ex """#password;""" +;password = + +# Use either URL or the previous fields to configure the database +# Example: mysql://user:secret@host:port/database +;url = + +# For "postgres" only, either "disable", "require" or "verify-full" +;ssl_mode = disable + +;ca_cert_path = +;client_key_path = +;client_cert_path = +;server_cert_name = + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +# Max idle conn setting default is 2 +;max_idle_conn = 2 + +# Max conn setting default is 0 (means not set) +;max_open_conn = + +# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) +;conn_max_lifetime = 14400 + +# Set to true to log the sql calls and execution times. +;log_queries = + +# For "sqlite3" only. cache mode setting used for connecting to the database. (private, shared) +;cache_mode = private + +#################################### Cache server ############################# +[remote_cache] +# Either "redis", "memcached" or "database" default is "database" +;type = database + +# cache connection string options +# database: will use Grafana primary database. +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'. +# memcached: 127.0.0.1:11211 +;connstr = + +#################################### Data proxy ########################### +[dataproxy] + +# This enables data proxy logging, default is false +;logging = false + +# How long the data proxy waits before timing out, default is 30 seconds. +# This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set. +;timeout = 30 + +# How many seconds the data proxy waits before sending a keepalive probe request. +;keep_alive_seconds = 30 + +# How many seconds the data proxy waits for a successful TLS Handshake before timing out. 
+;tls_handshake_timeout_seconds = 10 + +# How many seconds the data proxy will wait for a server's first response headers after +# fully writing the request headers if the request has an "Expect: 100-continue" +# header. A value of 0 will result in the body being sent immediately, without +# waiting for the server to approve. +;expect_continue_timeout_seconds = 1 + +# The maximum number of idle connections that Grafana will keep alive. +;max_idle_connections = 100 + +# How many seconds the data proxy keeps an idle connection open before timing out. +;idle_conn_timeout_seconds = 90 + +# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request, default is false. +;send_user_header = false + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. 
+;reporting_enabled = true + +# Set to false to disable all checks to https://grafana.net +# for new versions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.com to get latest versions +;check_for_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +# Google Tag Manager ID, only enabled if you specify an id here +;google_tag_manager_id = + +#################################### Security #################################### +[security] +# disable creation of admin user on first start of grafana +;disable_initial_admin_creation = false + +# default admin user, created on startup +;admin_user = admin + +# default admin password, can be changed before first start of grafana, or in profile settings +;admin_password = admin + +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +# disable protection against brute force login attempts +;disable_brute_force_login_protection = false + +# set to true if you host Grafana behind HTTPS. default is false. +;cookie_secure = false + +# set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled" +;cookie_samesite = lax + +# set to true if you want to allow browsers to render Grafana in a ,