常用告警
[root@gtcq-gt-monitor-prometheus-01 rules]# more gt-dwz-monitor.rules
# Prometheus alerting rules for targets scraped by the "gt-dwz-node-exporter" job.
#
# Conventions shared by every rule in this group:
#   * `alert_host` is `instance` with everything from the first ":" onward removed,
#     so notifications carry the host without its port.
#   * A rule fires only after its `expr` has matched continuously for `for:`.
#   * Long PromQL expressions use folded scalars (`>-`); line breaks become spaces.
groups:
  - name: dwz-gt-monitor
    rules:
      # Scrape target reports `up == 0`.
      - alert: "node-Agent告警"
        expr: up{job="gt-dwz-node-exporter"} == 0
        for: 120s
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "Agent告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          # Duration kept in sync with `for:` above (the message previously said 30s).
          summary: '{{ $labels.instance }} 已停止采集监控数据 120s!'
          description: '{{ $labels.instance }} job {{ $labels.job }} 暴露监控数据已停止.'

      # CPU busy percentage per instance over 5m: 100 - idle / total * 100, rounded up.
      - alert: "CPU使用率监控"
        expr: >-
          ceil(
          100 - sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter",mode="idle"}[5m])) by(instance)
          / sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter"}[5m])) by(instance) * 100
          ) > 80
        for: 2m
        labels:
          severity: "重要"
          # NOTE(review): every other rule in this group uses team "dwz-gt-monitor";
          # confirm "bdfb" is intentional and not a copy-paste leftover.
          team: bdfb
          alert_type: "CPU告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} CPU使用率过高'
          description: 'IP:{{ reReplaceAll ":(.*)" "" $labels.instance }}的CPU使用大于80% (当前值: {{ $value }})'

      # Used-space percentage per filesystem (ext3/ext4/xfs/nfs only), rounded.
      - alert: "磁盘使用率监控"
        expr: >-
          round((1 - (
          node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
          / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
          )) * 100) > 80
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "Disk告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高'
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }}%)'

      # Memory in use as a percentage: (1 - MemAvailable / MemTotal) * 100, rounded up.
      - alert: "内存使用率监控"
        expr: >-
          ceil((1 - (
          node_memory_MemAvailable_bytes{job="gt-dwz-node-exporter"}
          / (node_memory_MemTotal_bytes{job="gt-dwz-node-exporter"})
          )) * 100) > 80
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "MEM告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}内存使用率过高'
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }}内存使用大于80% (当前值: {{ $value }})'

      # 5-minute load average above a fixed value.
      # NOTE(review): the threshold is absolute (not divided by CPU count);
      # confirm 100 is the intended limit for every host in this job.
      - alert: "服务器大法宝CPULoad5"
        expr: node_load5{job="gt-dwz-node-exporter"} > 100
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "负载告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}CPU负载过高'
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} CPU负载load大于100 (当前值: {{ $value }})'

      # Allocated file descriptors above a fixed count.
      - alert: "服务器文件句柄监控"
        expr: node_filefd_allocated{job="gt-dwz-node-exporter"} > 50000
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "文件句柄告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 文件句柄使用过高'
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 文件句柄使用过高大于50000 (当前值: {{ $value }})'

      # Sockets in TIME_WAIT above a fixed count.
      # NOTE(review): the alert name says "TCP connection count" but the metric
      # is TIME_WAIT only; the annotations describe it correctly.
      - alert: "服务器TCP连接数监控"
        expr: node_sockstat_TCP_tw{job="gt-dwz-node-exporter"} > 15000
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "TCP连接数告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 等待关闭的TCP连接数过高'
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 等待关闭的TCP连接数TIME_WAIT过高大于15000 (当前值: {{ $value }})'

      # Inbound throughput per instance in MiB/s, summed over non-virtual devices.
      # Changes from the previous expression:
      #   * device regex: "virbr*" / "lo*" were glob-style; label regexes are fully
      #     anchored, so "virbr*" never matched e.g. "virbr0". Now "virbr.*" and "lo".
      #   * rate() instead of irate(): irate looks only at the last two samples, so a
      #     brief dip can reset the `for:` timer; rate() is the documented choice for alerts.
      - alert: "服务器入口流量监控"
        expr: >-
          round(
          sum by (instance) (
          rate(node_network_receive_bytes_total{job="gt-dwz-node-exporter",
          device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])
          ) / 1024 / 1024
          ) > 50
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "流量告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}监控入口流量过高'
          # Removed a duplicated "过高" and made the unit a rate (MB/s).
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 监控入口流量过高大于50MB/s (告警值: {{ $value }}MB/s)'

      # Outbound throughput per instance in MiB/s; same device filter and rate()
      # reasoning as the inbound rule (anchored regex fix, rate over irate).
      - alert: "服务器出口流量监控"
        expr: >-
          round(
          sum by (instance) (
          rate(node_network_transmit_bytes_total{job="gt-dwz-node-exporter",
          device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])
          ) / 1024 / 1024
          ) > 50
        for: 2m
        labels:
          severity: "重要"
          team: dwz-gt-monitor
          alert_type: "流量告警"
          alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
        annotations:
          summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 监控出口流量过高'
          # Removed a duplicated "过高" and made the unit a rate (MB/s).
          description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 监控出口流量过高大于50MB/s (告警值: {{ $value }}MB/s)'
[root@gtcq-gt-monitor-prometheus-01 rules]#