常用告警

[root@gtcq-gt-monitor-prometheus-01 rules]# more    gt-dwz-monitor.rules
# Prometheus alerting rule file: one rule group whose rules all query the
# "gt-dwz-node-exporter" scrape job.
groups:
- name: dwz-gt-monitor
  # Each list entry below is one alerting rule
  # (alert / expr / for / labels / annotations).
  rules:
  # Agent-down alert: fires when the node-exporter target has reported
  # up == 0 continuously for the `for` duration below.
  - alert: "node-Agent告警"
    expr: up{job="gt-dwz-node-exporter"} == 0
    # Same hold time as before (120s), written like the sibling rules.
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "Agent告警"
      # Drops everything from the first ":" onward in the instance label,
      # leaving only the host part.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      # The duration in the message now matches the `for` hold time
      # (it previously claimed 30s while the rule waited 120s).
      summary: "{{ $labels.instance }} 已停止采集监控数据 2m!"
      description: "{{ $labels.instance }} job {{ $labels.job }} 暴露监控数据已停止."

  # CPU usage alert: per instance, 100 minus the idle share of all CPU
  # seconds accumulated over the last 5 minutes, rounded up; fires when the
  # result stays above 80 for 2 minutes.
  - alert: "CPU使用率监控"
    # Folded block scalar (>-): the lines are joined with single spaces into
    # one PromQL expression. The previous plain scalar was hard-wrapped onto a
    # line indented less than its key, which is not valid YAML.
    expr: >-
      ceil(
      100 -
      sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter",mode="idle"}[5m])) by(instance)
      / sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter"}[5m])) by(instance)
      * 100
      ) > 80
    for: 2m
    labels:
      severity: "重要"
      # NOTE(review): every other rule in this group uses team
      # "dwz-gt-monitor"; confirm "bdfb" is the intended routing label.
      team: bdfb
      alert_type: "CPU告警"
      # Drops everything from the first ":" onward in the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU使用率过高"
      description: "IP:{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的CPU使用大于80% (当前值: {{ $value }})"

  # Disk usage alert: per filesystem (ext3/ext4/xfs/nfs), the used share
  # computed as 1 - avail/size, as a rounded percentage; fires when it stays
  # above 80 for 2 minutes.
  - alert: "磁盘使用率监控"
    # Folded block scalar (>-): the lines are joined with single spaces into
    # one PromQL expression. The previous plain scalar was hard-wrapped in the
    # middle of a label matcher onto a line at column 0, which is not valid
    # YAML.
    expr: >-
      round(
      (1 - (
      node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
      / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
      )) * 100
      ) > 80
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "Disk告警"
      # Drops everything from the first ":" onward in the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
      description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }}%)"

  # Memory usage alert: 1 - MemAvailable/MemTotal as a percentage, rounded
  # up; fires when it stays above 80 for 2 minutes.
  - alert: '内存使用率监控'
    expr: 'ceil( (1 - (node_memory_MemAvailable_bytes{job="gt-dwz-node-exporter"} / (node_memory_MemTotal_bytes{job="gt-dwz-node-exporter"})))* 100 ) > 80'
    for: 2m
    labels:
      severity: '重要'
      team: 'dwz-gt-monitor'
      alert_type: 'MEM告警'
      # Drops everything from the first ":" onward in the instance label.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}内存使用率过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }}内存使用大于80% (当前值: {{ $value }})'

  # Load alert: fires when node_load5 stays above 100 for 2 minutes.
  - alert: '服务器大法宝CPULoad5'
    expr: 'node_load5{job="gt-dwz-node-exporter"} > 100'
    for: 2m
    labels:
      severity: '重要'
      team: 'dwz-gt-monitor'
      alert_type: '负载告警'
      # Drops everything from the first ":" onward in the instance label.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}CPU负载过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} CPU负载load大于100 (当前值: {{ $value }})'

  # File-handle alert: fires when node_filefd_allocated stays above 50000
  # for 2 minutes.
  - alert: '服务器文件句柄监控'
    expr: 'node_filefd_allocated{job="gt-dwz-node-exporter"} > 50000'
    for: 2m
    labels:
      severity: '重要'
      team: 'dwz-gt-monitor'
      alert_type: '文件句柄告警'
      # Drops everything from the first ":" onward in the instance label.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 文件句柄使用过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 文件句柄使用过高大于50000 (当前值: {{ $value }})'

  # TCP alert: fires when node_sockstat_TCP_tw (described in the message
  # text as TIME_WAIT connections) stays above 15000 for 2 minutes.
  - alert: '服务器TCP连接数监控'
    expr: 'node_sockstat_TCP_tw{job="gt-dwz-node-exporter"} > 15000'
    for: 2m
    labels:
      severity: '重要'
      team: 'dwz-gt-monitor'
      alert_type: 'TCP连接数告警'
      # Drops everything from the first ":" onward in the instance label.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 等待关闭的TCP连接数过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 等待关闭的TCP连接数TIME_WAIT过高大于15000 (当前值: {{ $value }})'

  # Inbound traffic alert: per instance, the summed receive rate of the
  # non-excluded network devices, divided by 1024*1024 and rounded; fires when
  # it stays above 50 for 2 minutes.
  - alert: "服务器入口流量监控"
    # The device matcher is a regular expression, not a glob, and Prometheus
    # anchors it to the whole label value. The former "virbr*|lo*" therefore
    # never matched names such as "virbr0"; "virbr.*|lo" excludes the virbr
    # bridges and the loopback device as intended.
    expr: >-
      round(
      sum by (instance) (
      irate(node_network_receive_bytes_total{job="gt-dwz-node-exporter",device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])
      ) / 1024 / 1024
      ) > 50
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "流量告警"
      # Drops everything from the first ":" onward in the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}监控入口流量过高"
      # Removed the duplicated "过高"; the value is a per-second rate, so the
      # unit is MB/s.
      description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控入口流量过高大于50MB/s (告警值: {{ $value }}MB/s)"

  # Outbound traffic alert: per instance, the summed transmit rate of the
  # non-excluded network devices, divided by 1024*1024 and rounded; fires when
  # it stays above 50 for 2 minutes.
  - alert: "服务器出口流量监控"
    # The device matcher is a regular expression, not a glob, and Prometheus
    # anchors it to the whole label value. The former "virbr*|lo*" therefore
    # never matched names such as "virbr0"; "virbr.*|lo" excludes the virbr
    # bridges and the loopback device as intended.
    expr: >-
      round(
      sum by (instance) (
      irate(node_network_transmit_bytes_total{job="gt-dwz-node-exporter",device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])
      ) / 1024 / 1024
      ) > 50
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "流量告警"
      # Drops everything from the first ":" onward in the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高"
      # Removed the duplicated "过高"; the value is a per-second rate, so the
      # unit is MB/s.
      description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高大于50MB/s (告警值: {{ $value }}MB/s)"
[root@gtcq-gt-monitor-prometheus-01 rules]#