# This Is the Alert Rules Template for AutoMQ, Please Modify the Alert Thresholds and Period Per Your Needs
# Before Applying It to Your Production Environment.
groups:
- name: kafka_alerts
rules:
- alert: HighTopicBytesInPerSec
expr: sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic) > 50 * 1024 * 1024
for: 5m
labels:
severity: warning
annotations:
summary: "High inbound network throughput {{ printf \"%0.2f\" $value }} Bytes/s for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The inbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} is exceeding threshold."
- alert: LowTopicBytesInPerSec
expr: sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic) < 1024
for: 5m
labels:
severity: warning
annotations:
summary: "Low inbound network throughput {{ printf \"%0.2f\" $value }} Bytes/s for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The inbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} is below threshold."
- alert: HighTopicBytesOutPerSec
expr: sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic) > 50 * 1024 * 1024
for: 5m
labels:
severity: warning
annotations:
summary: "High outbound network throughput {{ printf \"%0.2f\" $value }} Bytes/s for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The outbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} is exceeding threshold."
- alert: LowTopicBytesOutPerSec
expr: sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic) < 1024
for: 5m
labels:
severity: warning
annotations:
summary: "Low outbound network throughput {{ printf \"%0.2f\" $value }} Bytes/s for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The outbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} is below threshold."
- alert: HighGroupConsumeRatePerTopic
expr: sum(max(rate(kafka_group_commit_offset{consumer_group="example_group", topic="example_topic"}[1m])) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High group consume rate {{ printf \"%0.2f\" $value }} msg/s for consumer group {{ $labels.consumer_group }} on topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The consume rate of consumer group {{ $labels.consumer_group }} on topic {{ $labels.topic }} in cluster {{ $labels.job }} is exceeding threshold."
- alert: LowGroupConsumeRatePerTopic
expr: sum(max(rate(kafka_group_commit_offset{consumer_group="example_group", topic="example_topic"}[1m])) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) < 10
for: 5m
labels:
severity: warning
annotations:
summary: "Low group consume rate {{ printf \"%0.2f\" $value }} msg/s for consumer group {{ $labels.consumer_group }} on topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The consume rate of consumer group {{ $labels.consumer_group }} on topic {{ $labels.topic }} in cluster {{ $labels.job }} is below threshold."
- alert: HighTopicBytesInPerSecDayToDayChange
expr: (sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic)
- sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic))
/ sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic) > 0.2
for: 5m
labels:
severity: warning
annotations:
summary: "High inbound network throughput change {{ printf \"%0.2f\" $value }} for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The increase of inbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} compared to 24h ago is exceeding threshold"
- alert: LowTopicBytesInPerSecDayToDayChange
expr: (sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic)
- sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic))
/ sum(max(rate(kafka_network_io_bytes_total{direction="in", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic) < -0.2
for: 5m
labels:
severity: warning
annotations:
summary: "Low inbound network throughput change {{ printf \"%0.2f\" $value }} for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The decrease of inbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} compared to 24h ago is exceeding threshold"
- alert: HighTopicBytesOutPerSecDayToDayChange
expr: (sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic)
- sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic))
/ sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic) > 0.2
for: 5m
labels:
severity: warning
annotations:
summary: "High outbound network throughput change {{ printf \"%0.2f\" $value }} for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The increase of outbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} compared to 24h ago is exceeding threshold"
- alert: LowTopicBytesOutPerSecDayToDayChange
expr: (sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m])) by (job, topic, partition)) by (job, topic)
- sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic))
/ sum(max(rate(kafka_network_io_bytes_total{direction="out", topic="example_topic"}[1m] offset 24h)) by (job, topic, partition)) by (job, topic) < -0.2
for: 5m
labels:
severity: warning
annotations:
summary: "Low outbound network throughput change {{ printf \"%0.2f\" $value }} for topic {{ $labels.topic }} in cluster {{ $labels.job }}"
description: "The decrease of outbound bytes per second produced by topic {{ $labels.topic }} in cluster {{ $labels.job }} compared to 24h ago is exceeding threshold"
- alert: HighGroupConsumerLag
expr: (sum(max(kafka_log_end_offset{topic="example_topic"}) by (job, topic, partition)) by (job, topic)
- on (topic) group_left (consumer_group) sum(max(kafka_group_commit_offset{consumer_group="example_group", topic="example_topic"}) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 10000)
and (count(max(kafka_log_end_offset{topic="example_topic"}) by (job, topic, partition)) by (job, topic) == count(max(kafka_group_commit_offset{consumer_group="example_group", topic="example_topic"}) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic))
for: 1m
labels:
severity: warning
annotations:
summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold."