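# Documentation: Prometheus alerting rules for host-level metrics (CPU, memory, disk, load, I/O wait, network) and target availability.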
groups:
  # Rule group with host-level resource alerts and a target-availability alert.
  # Every rule attaches `environment: prod` and a `severity` label; the
  # annotations are templated with the labels of the firing series.
  - name: alert-rules
    rules:
      # High CPU Usage Alert
      # Fires when average non-idle CPU per instance stays above 80% for 10m.
      # rate() is used instead of irate(): irate() only looks at the last two
      # samples, so short spikes/dips can keep resetting the `for` timer.
      - alert: HighCPUUsage
        expr: (100 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
          environment: prod
        annotations:
          summary: "High CPU Usage Detected on {{ $labels.instance }}"
          description: "CPU usage is above 80% for more than 10 minutes on instance {{ $labels.instance }}."

      # High Memory Usage Alert
      # Fires when (MemTotal - MemAvailable) / MemTotal stays above 80% for 5m.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: critical
          environment: prod
        annotations:
          summary: "High Memory Usage Detected on {{ $labels.instance }}"
          description: "Memory usage is above 80% for more than 5 minutes on instance {{ $labels.instance }}."

      # Disk Storage Full Alert
      # Fires when a filesystem (tmpfs and fuse.lxcfs excluded) stays above 90%
      # used for 50m.
      # NOTE(review): the text for this rule previously said 80% while the
      # expression has always compared against 90; the text now follows the
      # expression. Confirm which threshold is actually intended.
      - alert: DiskStorageFull
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}
          - node_filesystem_free_bytes{fstype!~"tmpfs|fuse.lxcfs"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"} * 100 > 90
        for: 50m
        labels:
          severity: warning
          environment: prod
        annotations:
          summary: "Disk Storage Nearly Full on {{ $labels.instance }}"
          description: "Disk storage is above 90% full for more than 50 minutes on instance {{ $labels.instance }}."

      # Instance Down Alert
      # Fires when a scrape target has reported up == 0 for 1m.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
          environment: prod
        annotations:
          summary: "Instance Down Alert: {{ $labels.instance }} + Instance Name : {{ $labels.job }}"
          description: "Instance {{ $labels.instance }} is down for more than 1 minute."

      # High Load Average Alert
      # Fires when the 1-minute load average exceeds 80% of the instance's CPU
      # count for 10m. The CPU count must be computed per instance and joined
      # on `instance`: a bare count(...) yields a single label-less sample that
      # never matches any node_load1 series, so the alert could not fire.
      # group_left() keeps the labels of node_load1 (e.g. job) on the result.
      - alert: HighLoadAverage
        expr: >-
          node_load1
          / on(instance) group_left()
          count by(instance) (node_cpu_seconds_total{mode="system"})
          * 100 > 80
        for: 10m
        labels:
          severity: warning
          environment: prod
        annotations:
          summary: "High Load Average on {{ $labels.instance }}"
          description: "Load average is above 80% of CPU count for more than 10 minutes on instance {{ $labels.instance }}."

      # High Disk I/O Wait Alert
      # Fires when the per-instance average iowait CPU fraction stays above
      # 0.25 for 10m.
      - alert: HighDiskIOWait
        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.25
        for: 10m
        labels:
          severity: warning
          environment: prod
        annotations:
          summary: "High Disk I/O Wait on {{ $labels.instance }}"
          description: "Disk I/O wait is high for more than 10 minutes on instance {{ $labels.instance }}."

      # High Network Inbound Traffic Alert
      # Fires when the receive rate stays above 100 MiB/s (100 * 1024 * 1024
      # bytes/s) for 2m. Adjust based on instance bandwidth capacity.
      - alert: HighNetworkIn
        expr: rate(node_network_receive_bytes_total[5m]) > (100 * 1024 * 1024)
        for: 2m
        labels:
          severity: warning
          environment: prod
        annotations:
          summary: "High Network Inbound Traffic on {{ $labels.instance }}"
          description: "Inbound network traffic is high on instance {{ $labels.instance }}."

      # High Network Outbound Traffic Alert
      # Fires when the transmit rate stays above 100 MiB/s (100 * 1024 * 1024
      # bytes/s) for 2m. Adjust based on instance bandwidth capacity.
      - alert: HighNetworkOut
        expr: rate(node_network_transmit_bytes_total[5m]) > (100 * 1024 * 1024)
        for: 2m
        labels:
          severity: warning
          environment: prod
        annotations:
          summary: "High Network Outbound Traffic on {{ $labels.instance }}"
          description: "Outbound network traffic is high on instance {{ $labels.instance }}."