@llplmlyd
2021-06-28T17:19:55.000000Z
字数 3651
阅读 643
数据库-TiDB
# TiDB
count(probe_success{tidb_cluster="$tidb_cluster", group="tidb"} == 1)
# PD
count(probe_success{tidb_cluster="$tidb_cluster", group="pd"} == 1)
# TiKV
count(probe_success{tidb_cluster="$tidb_cluster", group="tikv"} == 1)
# TiFlash
count(probe_success{tidb_cluster="$tidb_cluster", group="tiflash"} == 1)
# Pump
count(probe_success{tidb_cluster="$tidb_cluster", group="pump"} == 1)
## 整个集群的磁盘使用率
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance",type="storage_size"}) / sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance",type="storage_capacity"})
## 失联的store数量
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="store_disconnected_count"})
## 不健康的store数量
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="store_unhealth_count"})
## 存储空间较低的store数量
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="store_low_space_count"})
## 进程停止的store数量
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="store_down_count"})
## 正在下线的store数量
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="store_offline_count"})
## 已经下线的store数量
sum(pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="store_tombstone_count"})
## 当前集群的 Region 总量(以 leader 数量统计,每个 Region 只有一个 leader,因此 Region 数量与副本数无关)
pd_cluster_status{tidb_cluster="$tidb_cluster", instance="$instance", type="leader_count"}
## region health
pd_regions_status{tidb_cluster="$tidb_cluster", instance="$instance"}
sum(pd_regions_status{tidb_cluster="$tidb_cluster"}) by (instance, type)
## PD 运行时间(当前时间减去进程启动时间,单位:秒)
(time() - process_start_time_seconds{tidb_cluster="$tidb_cluster",job=~".*pd.*"})
## 创建的调度
sum(delta(pd_schedule_operators_count{tidb_cluster="$tidb_cluster", instance="$instance", event="create"}[1m])) by (type)
## 完成的调度
sum(delta(pd_schedule_operators_count{tidb_cluster="$tidb_cluster", instance="$instance", event="finish"}[1m])) by (type)
## TiKV leader info
sum(tikv_raftstore_region_count{tidb_cluster="$tidb_cluster", type="leader"}) by (instance)
## TiKV region info
sum(tikv_raftstore_region_count{tidb_cluster="$tidb_cluster", type="region"}) by (instance)
## TiKV 内存使用量(各实例的常驻内存,单位:字节)
avg(process_resident_memory_bytes{tidb_cluster="$tidb_cluster", instance=~"$instance"}) by (instance)
## TiKV Server is Busy
sum(rate(tikv_scheduler_too_busy_total{instance=~"$instance"}[1m])) by (instance)
sum(rate(tikv_channel_full_total{instance=~"$instance"}[1m])) by (instance, type)
sum(rate(tikv_coprocessor_request_error{instance=~"$instance", type='full'}[1m])) by (instance)
avg(tikv_engine_write_stall{instance=~"$instance", type="write_stall_percentile99"}) by (instance)
## TiKV 待 compaction 的堆积字节数,反映写入压力情况
sum(tikv_engine_pending_compaction_bytes{tidb_cluster="$tidb_cluster", instance=~"$instance", db="$db"}) by (cf)
## TiDB 内存使用量(常驻内存,单位:字节)
process_resident_memory_bytes{tidb_cluster="$tidb_cluster", job="tidb"}
## TiDB运行时间
(time() - process_start_time_seconds{tidb_cluster="$tidb_cluster", job="tidb"})
## TiDB 各个节点的连接数
tidb_server_connections{tidb_cluster="$tidb_cluster"}
## 集群总的连接数
sum(tidb_server_connections{tidb_cluster="$tidb_cluster"})
## TiDB 查询延迟 P99(99 分位)
histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type!="internal"}[1m])) by (le))
## TiDB 查询延迟 P95(95 分位)
histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket{tidb_cluster="$tidb_cluster", sql_type!="internal"}[1m])) by (le))
## 集群QPS
### 按语句类型统计
sum(rate(tidb_executor_statement_total{tidb_cluster="$tidb_cluster"}[1m])) by (type)
### 集群总 QPS
sum(rate(tidb_executor_statement_total{tidb_cluster="$tidb_cluster"}[1m]))
## 集群TPS
sum(rate(tidb_session_transaction_duration_seconds_count{tidb_cluster="$tidb_cluster"}[1m])) by (type, txn_mode)