# Documentation

 
# /etc/prometheus/alert.rules.yml
# Alerting rule definitions. The rules that follow are listed under the
# `system-alerts` group.
# NOTE(review): the node_* metrics used below are presumably exported by
# node_exporter — confirm it is scraped on every monitored target.
groups:
  - name: system-alerts
    rules:
 
      # High CPU Usage Alert
      # Fires when the per-instance average non-idle CPU percentage stays above
      # 90 for the whole `for` window. rate() is used rather than irate():
      # irate() only considers the last two samples in the range, so a single
      # quiet scrape can reset the `for` timer and suppress the alert.
      - alert: HighCPUUsage
        expr: (100 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU Usage Detected on {{ $labels.instance }}"
          # Duration in the text is kept in sync with `for: 5m` above.
          description: "CPU usage is above 90% for more than 5 minutes on instance {{ $labels.instance }}."
 
      # High Memory Usage Alert
      # Used memory is computed as MemTotal minus MemAvailable and expressed as
      # a percentage of MemTotal; the condition must hold for 2 minutes.
      - alert: HighMemoryUsage
        # Folded scalar: the two lines below are joined with a single space.
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'High Memory Usage Detected on {{ $labels.instance }}'
          description: 'Memory usage is above 90% for more than 2 minutes on instance {{ $labels.instance }}.'
 
      # Disk Storage Full Alert
      # Used space is size minus free bytes, as a percentage of size, for every
      # filesystem whose fstype is not tmpfs or fuse.lxcfs. The condition must
      # hold for 5 minutes.
      # NOTE(review): one alert is produced per matching filesystem series, but
      # the annotations only name the instance — consider adding the mountpoint.
      - alert: DiskStorageFull
        # Folded scalar: the lines below are joined with single spaces.
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}
          - node_filesystem_free_bytes{fstype!~"tmpfs|fuse.lxcfs"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"} * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk Storage Nearly Full on {{ $labels.instance }}"
          # Duration in the text is kept in sync with `for: 5m` above.
          description: "Disk storage is above 90% full for more than 5 minutes on instance {{ $labels.instance }}."
 
      # Instance Down Alert
      # Fires when a target's `up` series has been 0 for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          # The `job` label is shown as the job, not as an "instance name".
          summary: "Instance Down: {{ $labels.instance }} (job: {{ $labels.job }})"
          description: "Instance {{ $labels.instance }} has been down for more than 1 minute."
 
      # High Load Average Alert
      # Compares the 1-minute load average with the number of CPUs on the same
      # instance. The CPU count is taken per instance (one mode="system" series
      # per CPU) and joined to node_load1 on the `instance` label; without the
      # `by`/`on` clauses the count would be a single label-less series that
      # matches no node_load1 series, so the alert could never fire.
      # group_left() keeps node_load1's own labels (e.g. job) on the result.
      - alert: HighLoadAverage
        # Folded scalar: the lines below are joined with single spaces.
        expr: >-
          node_load1
          / on(instance) group_left()
          count by(instance) (node_cpu_seconds_total{mode="system"})
          * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Load Average on {{ $labels.instance }}"
          description: "Load average is above 80% of CPU count for more than 5 minutes on instance {{ $labels.instance }}."
 
      # High Disk I/O Wait Alert
      # Averages the 5-minute iowait rate across each instance's CPU series and
      # fires when that average stays above 0.25 for 5 minutes.
      - alert: HighDiskIOWait
        # Folded scalar: the two lines below are joined with a single space.
        expr: >-
          avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m]))
          > 0.25
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High Disk I/O Wait on {{ $labels.instance }}'
          description: 'Disk I/O wait is high for more than 5 minutes on instance {{ $labels.instance }}.'
 
      # High Network Inbound Traffic Alert
      # Fires when the 5-minute receive rate of a series exceeds
      # 100 * 1024 * 1024 bytes per second for 2 minutes.
      - alert: HighNetworkIn
        # Adjust based on instance bandwidth capacity
        expr: >-
          rate(node_network_receive_bytes_total[5m]) > (100 * 1024 * 1024)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'High Network Inbound Traffic on {{ $labels.instance }}'
          description: 'Inbound network traffic is high on instance {{ $labels.instance }}.'
 
      # High Network Outbound Traffic Alert
      # Fires when the 5-minute transmit rate of a series exceeds
      # 100 * 1024 * 1024 bytes per second for 2 minutes.
      - alert: HighNetworkOut
        # Adjust based on instance bandwidth capacity
        expr: >-
          rate(node_network_transmit_bytes_total[5m]) > (100 * 1024 * 1024)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'High Network Outbound Traffic on {{ $labels.instance }}'
          description: 'Outbound network traffic is high on instance {{ $labels.instance }}.'
# Alert Rule Prom.yaml Prom.yml