Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
# Pages when HTTP 5xx responses exceed 3% of all responses, summed over the
# HAProxy backends whose name does not match ".*nobid-cluster", for 5 minutes.
# rate() is used rather than irate(): irate() only considers the last two
# samples in the window, so a brief dip can reset the `for` timer.
alert: http_server_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend!~".*nobid-cluster",code="5xx",job="haproxy"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend!~".*nobid-cluster",job="haproxy"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Duration text kept in step with `for: 5m` above.
  description: HTTP 5XX errors more than 3% for more than 5 minutes.
  summary: '[APAC-SG] HTTP 5XX error'
|
ok
|
|
6.762s ago
|
439.7ms |
# Pages when more than 3% of the "ssp-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: ssp_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="ssp-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="ssp-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: SSP Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] SSP Cluster Crashes'
|
ok
|
|
6.323s ago
|
21.86ms |
# Fires once the HAProxy average response time reported for the
# "ssp-cluster" backend has stayed above 0.5 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: ssp-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="ssp-cluster"} > 0.5
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] SSP Cluster response time exception"
  description: >-
    SSP Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.301s ago
|
3.246ms |
# Pages when more than 3% of the "joox-ssp-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: joox-ssp_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="joox-ssp-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="joox-ssp-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: Joox-SSP Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] Joox-SSP Cluster Crashes'
|
ok
|
|
6.298s ago
|
21.85ms |
# Pages when more than 3% of the "joox-adx-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: joox-adx_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="joox-adx-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="joox-adx-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: Joox-ADX Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] Joox-ADX Cluster Crashes'
|
ok
|
|
6.277s ago
|
20.98ms |
# Fires once the HAProxy average response time reported for the
# "joox-adx-cluster" backend has stayed above 0.5 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: joox-adx-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="joox-adx-cluster"} > 0.5
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Joox-ADX Cluster response time exception"
  description: >-
    Joox-ADX Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.256s ago
|
2.539ms |
# Pages when more than 3% of the "adtrack-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: adtrack_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="adtrack-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="adtrack-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: ADTRACK Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] ADTRACK Cluster Crashes'
|
ok
|
|
6.254s ago
|
32.01ms |
# Fires once the HAProxy average response time reported for the
# "adtrack-cluster" backend has stayed above 0.1 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: adtrack-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="adtrack-cluster"} > 0.1
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Adtrack Cluster response time exception"
  description: >-
    Adtrack Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.222s ago
|
3.66ms |
# Pages when more than 3% of the "idsync-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: idsync_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="idsync-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="idsync-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: Idsync Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] Idsync Cluster Crashes'
|
ok
|
|
6.219s ago
|
28.5ms |
# Fires once the HAProxy average response time reported for the
# "idsync-cluster" backend has stayed above 0.1 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: idsync-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="idsync-cluster"} > 0.1
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Idsync Cluster response time exception"
  description: >-
    Idsync Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.19s ago
|
3.747ms |
# Pages when more than 3% of the "log-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: logserver_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="log-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="log-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: LogServer Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] LogServer Cluster Crashes'
|
ok
|
|
6.187s ago
|
27.86ms |
# Fires once the HAProxy average response time reported for the
# "log-cluster" backend has stayed above 0.1 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: log-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="log-cluster"} > 0.1
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Log Cluster response time exception"
  description: >-
    Log Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.159s ago
|
3.116ms |
# Pages when more than 3% of the "dsp-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: dsp_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="dsp-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="dsp-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: DSP Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] DSP Cluster Crashes'
|
ok
|
|
6.156s ago
|
27.58ms |
# Fires once the HAProxy average response time reported for the
# "dsp-cluster" backend has stayed above 0.15 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: dsp-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="dsp-cluster"} > 0.15
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] DSP Cluster response time exception"
  description: >-
    DSP Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.129s ago
|
3.332ms |
# Pages when more than 3% of the "adx-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: adx_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="adx-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="adx-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: ADX Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] ADX Cluster Crashes'
|
ok
|
|
6.126s ago
|
28.19ms |
# Fires once the HAProxy average response time reported for the
# "adx-cluster" backend has stayed above 0.3 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: adx-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="adx-cluster"} > 0.3
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] ADX Cluster response time exception"
  description: >-
    ADX Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.098s ago
|
4.291ms |
# Pages when more than 3% of the "hb-cluster" backend's responses are
# HTTP 4xx/5xx (5-minute rate), sustained for 5 minutes.
alert: hb_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="hb-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="hb-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in step with the 0.03 factor in expr.
  description: HB Cluster responds HTTP 4xx/5xx more than 3% total responses for more
    than 5 minutes.
  summary: '[APAC-SG] HB Cluster Crashes'
|
ok
|
|
6.094s ago
|
32.24ms |
# Fires once the HAProxy average response time reported for the
# "hb-cluster" backend has stayed above 0.5 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: hb-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="hb-cluster"} > 0.5
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] HB Cluster response time exception"
  description: >-
    HB Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.062s ago
|
3.437ms |
# Pages when more than 3% of the "hbwa-cluster" backend's responses are
# HTTP 5xx (5-minute rate), sustained for 5 minutes.
# Unlike the sibling *_error rules, the code matcher here is "[5]xx", so
# 4xx responses are NOT counted as errors for this backend.
alert: hbwa_error
expr: >-
  sum(rate(haproxy_backend_http_responses_total{backend="hbwa-cluster",code=~"[5]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="hbwa-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Text kept in step with expr: 5xx only, 0.03 factor.
  description: HBWA Cluster responds HTTP 5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] HBWA Cluster Crashes'
|
ok
|
|
6.058s ago
|
19.23ms |
# Fires once the HAProxy average response time reported for the
# "hbwa-cluster" backend has stayed above 0.1 for 10 minutes
# (unit presumably seconds, going by the metric name).
alert: hbwa-cluster_haproxy_response_time_exception
expr: haproxy_backend_http_response_time_average_seconds{backend="hbwa-cluster"} > 0.1
for: 10m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] HBWA Cluster response time exception"
  description: >-
    HBWA Cluster response time exception for more than 10 minutes.
|
ok
|
|
6.039s ago
|
2.416ms |
# Fires for every series whose `up` value has been 0 for one minute.
alert: InstanceDown
expr: up == 0
for: 1m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Instance {{ $labels.instance }} down"
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 1 minutes.
|
ok
|
|
6.037s ago
|
3.001ms |
# Fires when the free/size ratio of the filesystem mounted at "/etc/hosts"
# has been below 0.1 for 5 minutes.
# NOTE(review): "/etc/hosts" as a mountpoint presumably refers to a bind mount
# seen from inside a container; confirm it tracks the disk you care about.
alert: short_free_storage_error
expr: node_filesystem_free_bytes{mountpoint="/etc/hosts"} / node_filesystem_size_bytes < 0.1
for: 5m
labels:
  severity: page
  service: server
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Instance {{ $labels.instance }} has low free storage"
  description: >-
    Instance {{ $labels.instance }} has low free storage (current values:
    {{ $value }})
|
ok
|
|
6.034s ago
|
1.303ms |
# Fires when (MemFree + Cached + Buffers) / MemTotal has been below 0.1
# for 5 minutes on a series of job "node".
alert: low_avail_mem_error
expr: >-
  (node_memory_MemFree_bytes{job="node"} + node_memory_Cached_bytes{job="node"}
  + node_memory_Buffers_bytes{job="node"}) / node_memory_MemTotal_bytes < 0.1
for: 5m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] Instance {{ $labels.instance }} has low available memory"
  description: >-
    Instance {{ $labels.instance }} has low available memory (current
    values: {{ $value }})
|
ok
|
|
6.033s ago
|
1.433ms |
# Fires when the current-connection gauge has been above 10000 for 5 minutes.
# Two metric names are joined with `or`; presumably they come from different
# MongoDB exporter versions (verify against the deployed exporter).
alert: too_much_mongo_connection
expr: >-
  (mongodb_connections{job="mongo",state="current"} or
  mongodb_ss_connections{conn_type="current"}) > 10000
for: 5m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: "[APAC-SG] MongoDB instance {{ $labels.instance }} has too much connection"
  description: >-
    MongoDB instance {{ $labels.instance }} has too much connection (current
    values: {{ $value }})
|
ok
|
|
6.032s ago
|
412.9us |
# Fires when per-instance CPU usage, computed as 100 * (1 - average idle
# fraction across that instance's CPU series), has been above 85 for 10 minutes.
# rate() is used rather than irate(): irate() only considers the last two
# samples in the window, so a brief dip can reset the `for` timer.
alert: too_high_cpu_usage
expr: >-
  100
  * (1 - avg by(instance) (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m])))
  > 85
for: 10m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  description: 'Instance {{ $labels.instance }} has too high CPU usage (current values:
    {{ $value }})'
  # Percentage kept in step with the `> 85` threshold in expr.
  summary: '[APAC-SG] Instance {{ $labels.instance }} CPU usage exceed 85%'
|
ok
|
|
6.032s ago
|
22.99ms |
# Fires when the replication-lag value has been above 7200 for 30 minutes.
# Two metric names are joined with `or`; the second excludes members whose
# state label is "ARBITER".
alert: mongodb_replication_lag
expr: >-
  ((mongodb_replset_my_replica_lag) or
  (mongodb_mongod_replset_member_replication_lag{state!="ARBITER"})) > 7200
for: 30m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: >-
    [APAC-SG] MongoDB instance {{ $labels.instance }} mongodb replication
    too lag
  description: >-
    MongoDB instance {{ $labels.instance }} mongodb replication too lag
    (current values: {{ $value }})
|
ok
|
|
6.009s ago
|
680.4us |
# Fires when redis_master_last_io_seconds has been equal to -1 for 30 minutes;
# the annotations describe this as a lost connection to the master.
alert: redis_master_last_io_seconds
expr: (redis_master_last_io_seconds) == -1
for: 30m
labels:
  severity: page
  service: system
  datacenter: APAC-SG
annotations:
  summary: >-
    [APAC-SG] Redis instance {{ $labels.instance }} Redis connection with
    master lost
  description: >-
    Redis instance {{ $labels.instance }} Redis connection with master
    lost (current values: {{ $value }})
|
ok
|
|
6.009s ago
|
205.9us |
# Fires when redis_up has been anything other than 1 for one minute.
alert: Redis_Down
expr: redis_up != 1
for: 1m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
# Annotations added so this page carries a summary/description like every
# other rule in this group.
annotations:
  description: Redis instance {{ $labels.instance }} has been down for more than
    1 minute.
  summary: '[APAC-SG] Redis instance {{ $labels.instance }} down'
|
ok
|
|
6.009s ago
|
290.2us |
# Fires when a disk's I/O-time rate (5-minute window) times 100 has been
# above 90 for 5 minutes.
# The former `... or irate(node_disk_io_time_seconds_total[5m])` operand was
# removed: both sides select the same series with identical label sets, and
# `or` only takes right-hand elements whose label set is absent on the left,
# so the irate() branch could never contribute a sample.
alert: node_high_disk_io_usage
expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90
for: 5m
labels:
  datacenter: APAC-SG
  service: server
  severity: page
annotations:
  description: 'Node instance {{ $labels.instance }} Node has high disk io usage (current
    values: {{ $value }})'
  summary: '[APAC-SG] Node instance {{ $labels.instance }} Node has high disk io usage'
|
ok
|
|
6.009s ago
|
1.265ms |