Alerts

/etc/prometheus/alert.rules > APAC-SG
InstanceDown (0 active)
# Pages when a scrape target has reported `up == 0` continuously for 1 minute.
alert: InstanceDown
expr: 'up == 0'
for: 1m
annotations:
  summary: "[APAC-SG] Instance {{ $labels.instance }} down"
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for more
    than 1 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
Redis_Down (0 active)
# Pages when a Redis exporter target has reported `redis_up != 1` for 1 minute.
alert: Redis_Down
expr: redis_up != 1
for: 1m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
# Every other rule in this group carries a summary/description; without them
# the notification for this alert has no human-readable text.
annotations:
  description: 'Redis instance {{ $labels.instance }} has reported redis_up != 1
    for more than 1 minute.'
  summary: '[APAC-SG] Redis instance {{ $labels.instance }} down'
adtrack-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# adtrack-cluster backend stays above 0.1 for 10 minutes.
alert: adtrack-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="adtrack-cluster"}
  > 0.1
for: 10m
annotations:
  summary: "[APAC-SG] Adtrack Cluster response time exception"
  description: >-
    Adtrack Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
adtrack_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the adtrack-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: adtrack_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="adtrack-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="adtrack-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: ADTRACK Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] ADTRACK Cluster Crashes'
adx-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# adx-cluster backend stays above 0.3 for 10 minutes.
alert: adx-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="adx-cluster"}
  > 0.3
for: 10m
annotations:
  summary: "[APAC-SG] ADX Cluster response time exception"
  description: >-
    ADX Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
adx_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the adx-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: adx_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="adx-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="adx-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: ADX Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] ADX Cluster Crashes'
dsp-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# dsp-cluster backend stays above 0.15 for 10 minutes.
alert: dsp-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="dsp-cluster"}
  > 0.15
for: 10m
annotations:
  summary: "[APAC-SG] DSP Cluster response time exception"
  description: >-
    DSP Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
dsp_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the dsp-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: dsp_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="dsp-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="dsp-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: DSP Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] DSP Cluster Crashes'
hb-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# hb-cluster backend stays above 0.5 for 10 minutes.
alert: hb-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="hb-cluster"}
  > 0.5
for: 10m
annotations:
  summary: "[APAC-SG] HB Cluster response time exception"
  description: >-
    HB Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
hb_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the hb-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: hb_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="hb-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="hb-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: HB Cluster responds HTTP 4xx/5xx more than 3% total responses for more
    than 5 minutes.
  summary: '[APAC-SG] HB Cluster Crashes'
hbwa-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# hbwa-cluster backend stays above 0.1 for 10 minutes.
alert: hbwa-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="hbwa-cluster"}
  > 0.1
for: 10m
annotations:
  summary: "[APAC-SG] HBWA Cluster response time exception"
  description: >-
    HBWA Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
hbwa_error (0 active)
# Pages when the 5-minute rate of 5xx responses from the hbwa-cluster backend
# exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
# Unlike the sibling *_error rules, this one counts only 5xx, not 4xx.
# NOTE(review): if 4xx was meant to be included, restore code=~"[45]xx" and
# update the description accordingly.
alert: hbwa_error
# code="5xx" is an exact equivalent of the former single-character class
# regex code=~"[5]xx".
expr: sum(rate(haproxy_backend_http_responses_total{backend="hbwa-cluster",code="5xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="hbwa-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Text now matches expr: 5xx only, 3% threshold (previously "4xx/5xx", "5%").
  description: HBWA Cluster responds HTTP 5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] HBWA Cluster Crashes'
http_server_error (0 active)
# Pages when, across all haproxy-job backends whose name does not end in
# "nobid-cluster", the rate of 5xx responses exceeds 3% of the rate of all
# responses for 5 minutes.
alert: http_server_error
# rate() instead of irate(): irate only looks at the last two samples, so a
# single quiet scrape can reset the `for` timer; the Prometheus docs recommend
# rate() for alerting expressions.
expr: sum(rate(haproxy_backend_http_responses_total{backend!~".*nobid-cluster",code="5xx",job="haproxy"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend!~".*nobid-cluster",job="haproxy"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Duration kept in sync with `for: 5m` (previously said "1 minutes").
  description: HTTP 5XX errors more than 3% for more than 5 minutes.
  summary: '[APAC-SG] HTTP 5XX error'
idsync-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# idsync-cluster backend stays above 0.1 for 10 minutes.
alert: idsync-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="idsync-cluster"}
  > 0.1
for: 10m
annotations:
  summary: "[APAC-SG] Idsync Cluster response time exception"
  description: >-
    Idsync Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
idsync_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the idsync-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: idsync_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="idsync-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="idsync-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: Idsync Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] Idsync Cluster Crashes'
joox-adx-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# joox-adx-cluster backend stays above 0.5 for 10 minutes.
alert: joox-adx-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="joox-adx-cluster"}
  > 0.5
for: 10m
annotations:
  summary: "[APAC-SG] Joox-ADX Cluster response time exception"
  description: >-
    Joox-ADX Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
joox-adx_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the joox-adx-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: joox-adx_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="joox-adx-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="joox-adx-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: Joox-ADX Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] Joox-ADX Cluster Crashes'
joox-ssp_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the joox-ssp-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: joox-ssp_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="joox-ssp-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="joox-ssp-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: Joox-SSP Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] Joox-SSP Cluster Crashes'
log-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# log-cluster backend stays above 0.1 for 10 minutes.
alert: log-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="log-cluster"}
  > 0.1
for: 10m
annotations:
  summary: "[APAC-SG] Log Cluster response time exception"
  description: >-
    Log Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
logserver_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the log-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: logserver_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="log-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="log-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: LogServer Cluster responds HTTP 4xx/5xx more than 3% total responses
    for more than 5 minutes.
  summary: '[APAC-SG] LogServer Cluster Crashes'
low_avail_mem_error (0 active)
# Pages when (MemFree + Cached + Buffers) / MemTotal for a node-job instance
# stays below 0.1 for 5 minutes.
alert: low_avail_mem_error
expr: >-
  (node_memory_MemFree_bytes{job="node"}
  + node_memory_Cached_bytes{job="node"}
  + node_memory_Buffers_bytes{job="node"})
  / node_memory_MemTotal_bytes < 0.1
for: 5m
annotations:
  summary: "[APAC-SG] Instance {{ $labels.instance }} has low available memory"
  description: >-
    Instance {{ $labels.instance }} has low available memory
    (current values: {{ $value }})
labels:
  severity: page
  service: system
  datacenter: APAC-SG
mongodb_replication_lag (0 active)
# Pages when the replication lag metric stays above 7200 for 30 minutes.
# Two metric names are OR-ed together; series of the second metric with
# state="ARBITER" are excluded.
alert: mongodb_replication_lag
expr: >-
  ((mongodb_replset_my_replica_lag)
  or (mongodb_mongod_replset_member_replication_lag{state!="ARBITER"}))
  > 7200
for: 30m
annotations:
  summary: >-
    [APAC-SG] MongoDB instance {{ $labels.instance }} mongodb replication
    too lag
  description: >-
    MongoDB instance {{ $labels.instance }} mongodb replication too lag
    (current values: {{ $value }})
labels:
  severity: page
  service: system
  datacenter: APAC-SG
node_high_disk_io_usage (0 active)
# Pages when the 5-minute rate of node_disk_io_time_seconds_total, expressed
# as a percentage, stays above 90 for 5 minutes.
alert: node_high_disk_io_usage
# The former `rate(...) or irate(...)` was redundant: both functions use the
# same selector and window and so return the same label sets, which means the
# right-hand side of `or` could never contribute a series.
expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90
for: 5m
labels:
  datacenter: APAC-SG
  service: server
  severity: page
annotations:
  description: 'Node instance {{ $labels.instance }} Node has high disk io usage (current
    values: {{ $value }})'
  summary: '[APAC-SG] Node instance {{ $labels.instance }} Node has high disk io usage'
redis_master_last_io_seconds (0 active)
# Pages when redis_master_last_io_seconds has been equal to -1 for 30 minutes.
alert: redis_master_last_io_seconds
expr: '(redis_master_last_io_seconds) == -1'
for: 30m
annotations:
  summary: >-
    [APAC-SG] Redis instance {{ $labels.instance }} Redis connection with
    master lost
  description: >-
    Redis instance {{ $labels.instance }} Redis connection with master lost
    (current values: {{ $value }})
labels:
  severity: page
  service: system
  datacenter: APAC-SG
short_free_storage_error (0 active)
# Pages when free bytes / total bytes for the filesystem mounted at
# "/etc/hosts" stays below 0.1 for 5 minutes.
# NOTE(review): the "/etc/hosts" mountpoint presumably stands in for the host
# filesystem when the exporter runs in a container; confirm.
alert: short_free_storage_error
expr: >-
  node_filesystem_free_bytes{mountpoint="/etc/hosts"}
  / node_filesystem_size_bytes < 0.1
for: 5m
annotations:
  summary: "[APAC-SG] Instance {{ $labels.instance }} has low free storage"
  description: >-
    Instance {{ $labels.instance }} has low free storage
    (current values: {{ $value }})
labels:
  severity: page
  service: server
  datacenter: APAC-SG
ssp-cluster_haproxy_response_time_exception (0 active)
# Pages when the HAProxy average HTTP response time metric for the
# ssp-cluster backend stays above 0.5 for 10 minutes.
alert: ssp-cluster_haproxy_response_time_exception
expr: >-
  haproxy_backend_http_response_time_average_seconds{backend="ssp-cluster"}
  > 0.5
for: 10m
annotations:
  summary: "[APAC-SG] SSP Cluster response time exception"
  description: >-
    SSP Cluster response time exception for more than 10 minutes.
labels:
  severity: page
  service: system
  datacenter: APAC-SG
ssp_error (0 active)
# Pages when the 5-minute rate of 4xx/5xx responses from the ssp-cluster
# backend exceeds 3% (factor 0.03) of the rate of all responses for 5 minutes.
alert: ssp_error
expr: sum(rate(haproxy_backend_http_responses_total{backend="ssp-cluster",code=~"[45]xx"}[5m]))
  > sum(rate(haproxy_backend_http_responses_total{backend="ssp-cluster"}[5m]))
  * 0.03
for: 5m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  # Percentage kept in sync with the 0.03 factor in expr (previously said 5%).
  description: SSP Cluster responds HTTP 4xx/5xx more than 3% total responses for
    more than 5 minutes.
  summary: '[APAC-SG] SSP Cluster Crashes'
too_high_cpu_usage (0 active)
# Pages when per-instance CPU usage, computed as 100 * (1 - average idle rate
# across the instance's node_cpu_seconds_total{mode="idle"} series), stays
# above 85 for 10 minutes.
alert: too_high_cpu_usage
# rate() instead of irate(): irate only looks at the last two samples, so one
# brief dip can reset the `for` timer; the Prometheus docs recommend rate()
# for alerting expressions.
expr: 100
  * (1 - avg by(instance) (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m])))
  > 85
for: 10m
labels:
  datacenter: APAC-SG
  service: system
  severity: page
annotations:
  description: 'Instance {{ $labels.instance }} has too high CPU usage (current values:
    {{ $value }})'
  # Percentage kept in sync with the `> 85` threshold in expr (previously 80%).
  summary: '[APAC-SG] Instance {{ $labels.instance }} CPU usage exceed 85%'
too_much_mongo_connection (0 active)
# Pages when the current-connection count stays above 10000 for 5 minutes.
# Two metric names are OR-ed together in the expression.
alert: too_much_mongo_connection
expr: >-
  (mongodb_connections{job="mongo",state="current"}
  or mongodb_ss_connections{conn_type="current"})
  > 10000
for: 5m
annotations:
  summary: "[APAC-SG] MongoDB instance {{ $labels.instance }} has too much connection"
  description: >-
    MongoDB instance {{ $labels.instance }} has too much connection
    (current values: {{ $value }})
labels:
  severity: page
  service: system
  datacenter: APAC-SG