---
# MySQL Query Errors Monitor
#
# Counts trace events per 1-minute bucket, grouped by statusCode, span_name,
# cluster, namespace, role and workload, keeping only events that match
# eventType=mysql, status=error and source=eBPF. threshold_1 applies
# operator `gt` with value 0 to that query.
#
# NOTE(review): the source had lost all indentation; the nesting below is
# reconstructed from key order and list markers. Verify it against the
# monitor schema before deploying.
title: MySQL Query Errors Monitor
display:
  # Quoted so the template braces are always read as plain text.
  header: 'MySQL Error {{ alert.labels.statusCode }}'
  description: This monitor detects MySQL Query errors.
  resourceHeaderLabels:
    - span_name
    - role
  contextHeaderLabels:
    - cluster
    - namespace
    - workload
severity: S3
measurementType: event
model:
  queries:
    - name: threshold_input_query
      dataType: traces
      sqlPipeline:
        selectors:
          # _time passed through toStartOfInterval with a "1 minutes"
          # argument, exposed as bucket_timestamp.
          - key: _time
            origin: root
            type: string
            processors:
              - op: toStartOfInterval
                args:
                  - 1 minutes
            alias: bucket_timestamp
          - key: statusCode
            origin: root
            type: string
            alias: statusCode
          - key: span_name
            origin: root
            type: string
            alias: span_name
          - key: cluster
            origin: root
            type: string
            alias: cluster
          - key: namespace
            origin: root
            type: string
            alias: namespace
          - key: role
            origin: root
            type: string
            alias: role
          - key: workload
            origin: root
            type: string
            alias: workload
          # count over "*", exposed as logs_total ("*" must stay quoted:
          # a bare * starts a YAML alias).
          - key: "*"
            origin: root
            type: string
            processors:
              - op: count
            alias: logs_total
        # Same keys as the non-aggregate selectors above.
        groupBy:
          - key: _time
            origin: root
            type: string
            processors:
              - op: toStartOfInterval
                args:
                  - 1 minutes
          - key: statusCode
            origin: root
            type: string
            alias: statusCode
          - key: span_name
            origin: root
            type: string
            alias: span_name
          - key: cluster
            origin: root
            type: string
            alias: cluster
          - key: namespace
            origin: root
            type: string
            alias: namespace
          - key: role
            origin: root
            type: string
            alias: role
          - key: workload
            origin: root
            type: string
            alias: workload
        orderBy:
          - selector:
              key: bucket_timestamp
              origin: root
              type: string
            direction: ASC
        # Left empty in the source; written as an explicit null.
        limit: null
        # Conditions are combined with `and`.
        filters:
          operator: and
          conditions:
            - filters:
                - op: match
                  value: mysql
              key: eventType
              origin: root
              type: string
            - filters:
                - op: match
                  value: error
              key: status
              origin: root
              type: string
            - filters:
                - op: match
                  value: eBPF
              key: source
              origin: root
              type: string
      instantRollup: 1 minutes
  thresholds:
    # Reads the query named threshold_input_query defined above.
    - name: threshold_1
      inputName: threshold_input_query
      operator: gt
      values:
        - 0
executionErrorState: OK
noDataState: OK
evaluationInterval:
  interval: 1m
  pendingFor: 0s
labels:
  team: infra
---
title: gRPC API Errors Monitor
# gRPC API Errors monitor: display, query model and evaluation settings.
#
# Counts trace events per 1-minute bucket, grouped by statusCode, span_name,
# cluster, namespace, role and workload, keeping only events that match
# eventType=grpc, statusCode != "0", status=error and source=eBPF.
# threshold_1 applies operator `gt` with value 0 to that query.
#
# NOTE(review): the source had lost all indentation; the nesting below is
# reconstructed from key order and list markers. Verify it against the
# monitor schema before deploying.
display:
  # Quoted so the template braces are always read as plain text.
  header: 'gRPC API Error {{ alert.labels.statusCode }}'
  description: This monitor detects gRPC API errors by identifying responses with a non-zero status code.
  resourceHeaderLabels:
    - span_name
    - role
  contextHeaderLabels:
    - cluster
    - namespace
    - workload
severity: S3
measurementType: event
model:
  queries:
    - name: threshold_input_query
      dataType: traces
      sqlPipeline:
        selectors:
          # _time passed through toStartOfInterval with a "1 minutes"
          # argument, exposed as bucket_timestamp.
          - key: _time
            origin: root
            type: string
            processors:
              - op: toStartOfInterval
                args:
                  - 1 minutes
            alias: bucket_timestamp
          - key: statusCode
            origin: root
            type: string
            alias: statusCode
          - key: span_name
            origin: root
            type: string
            alias: span_name
          - key: cluster
            origin: root
            type: string
            alias: cluster
          - key: namespace
            origin: root
            type: string
            alias: namespace
          - key: role
            origin: root
            type: string
            alias: role
          - key: workload
            origin: root
            type: string
            alias: workload
          # count over "*", exposed as logs_total ("*" must stay quoted:
          # a bare * starts a YAML alias).
          - key: "*"
            origin: root
            type: string
            processors:
              - op: count
            alias: logs_total
        # Same keys as the non-aggregate selectors above.
        groupBy:
          - key: _time
            origin: root
            type: string
            processors:
              - op: toStartOfInterval
                args:
                  - 1 minutes
          - key: statusCode
            origin: root
            type: string
            alias: statusCode
          - key: span_name
            origin: root
            type: string
            alias: span_name
          - key: cluster
            origin: root
            type: string
            alias: cluster
          - key: namespace
            origin: root
            type: string
            alias: namespace
          - key: role
            origin: root
            type: string
            alias: role
          - key: workload
            origin: root
            type: string
            alias: workload
        orderBy:
          - selector:
              key: bucket_timestamp
              origin: root
              type: string
            direction: ASC
        # Left empty in the source; written as an explicit null.
        limit: null
        # Conditions are combined with `and`.
        filters:
          operator: and
          conditions:
            - filters:
                - op: match
                  value: grpc
              key: eventType
              origin: root
              type: string
            # "0" stays quoted so it remains a string, in line with the
            # `type: string` declared on this condition.
            - filters:
                - op: ne
                  value: "0"
              key: statusCode
              origin: root
              type: string
            - filters:
                - op: match
                  value: error
              key: status
              origin: root
              type: string
            - filters:
                - op: match
                  value: eBPF
              key: source
              origin: root
              type: string
      instantRollup: 1 minutes
  thresholds:
    # Reads the query named threshold_input_query defined above.
    - name: threshold_1
      inputName: threshold_input_query
      operator: gt
      values:
        - 0
executionErrorState: OK
noDataState: OK
evaluationInterval:
  interval: 1m
  pendingFor: 0s
---
title: High Error Log Rate Monitor
# High Error Log Rate monitor: severity, display, query model and
# evaluation settings.
#
# Counts log events per 1-minute bucket, grouped by workload, namespace and
# cluster, keeping only events that match level=error. threshold_1 applies
# operator `gt` with value 150 to that query.
#
# NOTE(review): the source had lost all indentation; the nesting below is
# reconstructed from key order and list markers. Verify it against the
# monitor schema before deploying.
severity: S4
display:
  header: High Log Error Rate
  description: This monitor will trigger an alert when we have a rate of error logs.
  resourceHeaderLabels:
    - workload
  contextHeaderLabels:
    - cluster
    - namespace
evaluationInterval:
  interval: 1m
  pendingFor: 0s
model:
  queries:
    - name: threshold_input_query
      dataType: logs
      sqlPipeline:
        selectors:
          # _time passed through toStartOfInterval with a "1 minutes"
          # argument, exposed as bucket_timestamp.
          - key: _time
            origin: root
            type: string
            processors:
              - op: toStartOfInterval
                args:
                  - 1 minutes
            alias: bucket_timestamp
          - key: workload
            origin: root
            type: string
            alias: workload
          - key: namespace
            origin: root
            type: string
            alias: namespace
          - key: cluster
            origin: root
            type: string
            alias: cluster
          # count over "*", exposed as logs_total ("*" must stay quoted:
          # a bare * starts a YAML alias).
          - key: "*"
            origin: root
            type: string
            processors:
              - op: count
            alias: logs_total
        # Same keys as the non-aggregate selectors above.
        groupBy:
          - key: _time
            origin: root
            type: string
            processors:
              - op: toStartOfInterval
                args:
                  - 1 minutes
          - key: workload
            origin: root
            type: string
            alias: workload
          - key: namespace
            origin: root
            type: string
            alias: namespace
          - key: cluster
            origin: root
            type: string
            alias: cluster
        orderBy:
          - selector:
              key: bucket_timestamp
              origin: root
              type: string
            direction: ASC
        # Left empty in the source; written as an explicit null.
        limit: null
        # Single condition; `operator: and` follows `conditions` here, as in
        # the source.
        filters:
          conditions:
            - filters:
                - op: match
                  value: error
              key: level
              origin: root
              type: string
          operator: and
      instantRollup: 1 minutes
  thresholds:
    # Reads the query named threshold_input_query defined above.
    - name: threshold_1
      inputName: threshold_input_query
      operator: gt
      values:
        - 150
noDataState: OK
measurementType: event
# NOTE(review): in the source, the following text was fused directly onto the
# `measurementType: event` line with no separator. It does not parse as part
# of this monitor, so it is preserved verbatim here as a comment:
#
#   Error is set, query execution errors will result in an error state.within_rangeoutside_range
#
# It looks like documentation residue (a sentence about an "Error" setting,
# then presumably the two values `within_range` and `outside_range`).
# Confirm where it belongs and relocate or remove it.