Documentation ¶
Overview ¶
Package metrics provides general system and process level metrics collection.
Index ¶
- Constants
- Variables
- func CollectProcessMetrics(refresh time.Duration)
- func NewAlarmManager() *alarmManager
- func NewCounter(name string) metrics.Counter
- func NewGauge(name string) metrics.Gauge
- func NewMeter(name string) metrics.Meter
- func NewTimer(name string) metrics.Timer
- func PointMetricsLog()
- func ReadDiskStats(stats *DiskStats) error
- func SprintMetrics(metricsName string, i interface{}) []string
- func ToStrings(str ...string) []string
- func WriteMetricsData(r metrics.Registry, refresh time.Duration)
- type Condition
- type DiskStats
- type MetricsMap
Constants ¶
// Kinds of metric value, numbered consecutively from 0 by iota in the order
// listed. Condition.MetricsType holds one of these values.
const (
	TypeCount metricsType = iota // 0
	TypeRate1
	TypeRate5
	TypeRate15
	TypeRateMean
	TypeMean
	TypeMax
	TypeMin
	TypeStdDev
	TypeSum
	TypeVariance
	TypeValue
)
const (
	CodeHeartbeat = uint32(0x01) // msg code of the heartbeat packet
)
// LevelDBPrefix is the shared prefix of the LevelDb_* metric names declared
// in this package.
const LevelDBPrefix = "glemo/db/chaindata/"
Variables ¶
var (
	PackagePrefix = []byte{0x77, 0x88} // package flag
	PackageLength = 4                  // number of bytes occupied by the msg length
)
var (
	MetricsEnabledFlagValue        = "metrics"
	Enabled                        = false // whether metrics are activated; decided by checking whether the config file sets the alarm server's url
	AlarmUrl                string         // url of the alarm system server, passed in through the config file
)
Enabled is the flag specifying whether metrics are enabled or not.
// Names of the metrics used by this package, followed by alarm thresholds.
//
// NOTE(review): LevelDb_compTime_meteName ("mete") and System__memory_frees
// (double underscore) look like misspellings, but they are exported names and
// are left unchanged here.
var (
	InvalidTx_meterName      = "txpool/DelInvalidTxs/invalid"
	TxpoolNumber_counterName = "txpool/totalTxNumber"
	VerifyFailedTx_meterName = "tx/VerifyTxBody/verifyFailed"

	HandleBlocksMsg_meterName                 = "network/protocol_manager/handleBlocksMsg"                 // measures how often handleBlocksMsg is called
	HandleGetBlocksMsg_meterName              = "network/protocol_manager/handleGetBlocksMsg"              // measures how often handleGetBlocksMsg is called
	HandleBlockHashMsg_meterName              = "network/protocol_manager/handleBlockHashMsg"              // measures how often handleBlockHashMsg is called
	HandleGetConfirmsMsg_meterName            = "network/protocol_manager/handleGetConfirmsMsg"            // measures how often handleGetConfirmsMsg is called
	HandleConfirmMsg_meterName                = "network/protocol_manager/handleConfirmMsg"                // measures how often handleConfirmMsg is called
	HandleGetBlocksWithChangeLogMsg_meterName = "network/protocol_manager/handleGetBlocksWithChangeLogMsg" // measures how often handleGetBlocksWithChangeLogMsg is called
	HandleDiscoverReqMsg_meterName            = "network/protocol_manager/handleDiscoverReqMsg"            // measures how often handleDiscoverReqMsg is called
	HandleDiscoverResMsg_meterName            = "network/protocol_manager/handleDiscoverResMsg"            // measures how often handleDiscoverResMsg is called

	LevelDb_get_timerName       = LevelDBPrefix + "user/gets"
	LevelDb_put_timerName       = LevelDBPrefix + "user/puts"
	LevelDb_del_timerName       = LevelDBPrefix + "user/dels"
	LevelDb_miss_meterName      = LevelDBPrefix + "user/misses" // rate of failed get operations on the database
	LevelDb_read_meterName      = LevelDBPrefix + "user/reads"  // size in bytes of the data read (get) from the database
	LevelDb_write_meterName     = LevelDBPrefix + "user/writes" // size in bytes of the data written (put) into the database
	LevelDb_compTime_meteName   = LevelDBPrefix + "user/time"
	LevelDb_compRead_meterName  = LevelDBPrefix + "user/input"
	LevelDb_compWrite_meterName = LevelDBPrefix + "user/output"

	BlockInsert_timerName   = "consensus/InsertBlock/insertBlock" // rate of inserting blocks into the chain and distribution of the time taken
	MineBlock_timerName     = "consensus/MineBlock/mineBlock"     // block production rate and time distribution
	VerifyBlock_meterName   = "consensus/dpovp"                   // rate at which verification of received blocks fails
	UnStableBlock_meterName = "consensus/dpovp/saveNewBlock"      // too many unstable blocks

	// Alarm conditions.
	// NOTE(review): AlarmRuleTable uses 15 (not 8) as the threshold for
	// MineBlock_timerName — confirm which value is intended.
	Alarm_BlockInsert float64 = 5 // average time taken by Insert chain is greater than 5s
	Alarm_MineBlock   float64 = 8 // average time taken by Mine Block is greater than 8s

	PeerConnFailed_meterName  = "p2p/listenLoop/failedHandleConn"
	ReadMsgSuccess_timerName  = "p2p/readLoop/readMsgSuccess"  // timer for msgs read successfully
	ReadMsgFailed_timerName   = "p2p/readLoop/readMsgFailed"   // timer for msgs that failed to be read
	WriteMsgSuccess_timerName = "p2p/WriteMsg/writeMsgSuccess" // timer for msgs written successfully
	WriteMsgFailed_timerName  = "p2p/WriteMsg/writeMsgFailed"  // timer for msgs that failed to be written

	System_memory_allocs   = "system/memory/allocs"   // number of memory allocations
	System__memory_frees   = "system/memory/frees"    // number of memory frees
	System_memory_inuse    = "system/memory/inuse"    // bytes allocated and still in use
	System_memory_pauses   = "system/memory/pauses"   // circular buffer of total GC pause time
	System_disk_readCount  = "system/disk/readcount"  // number of disk read operations
	System_disk_readData   = "system/disk/readdata"   // total number of bytes read
	System_disk_writeCount = "system/disk/writecount" // number of disk write operations
	System_disk_writeData  = "system/disk/writedata"  // total number of bytes written
)
// AlarmRuleTable maps a metric name to the Condition describing when that
// metric should raise an alarm: which metric value is inspected (MetricsType),
// the critical value (AlarmValue) and the reason text sent with the alarm.
//
// NOTE(review): judging by the reason texts, the TypeRate1 thresholds look
// like per-second rates (e.g. "more than 30 in the last minute" is paired
// with 0.5) — confirm against the code that evaluates the conditions.
var AlarmRuleTable = map[string]*Condition{
	InvalidTx_meterName: {
		AlarmReason:  "最近的一分钟时间内有大于30笔交易执行失败了",
		MetricsType:  TypeRate1,
		AlarmValue:   0.5,
		AlarmMsgCode: textMsgCode,
	},
	TxpoolNumber_counterName: {
		AlarmReason:  "交易池中的交易大于5000笔了",
		MetricsType:  TypeCount,
		AlarmValue:   5000,
		AlarmMsgCode: textMsgCode,
	},
	HandleBlocksMsg_meterName: {
		AlarmReason:  "最近一分钟时间内收到其他节点广播过来的blocks消息次数大于60次",
		MetricsType:  TypeRate1,
		AlarmValue:   1,
		AlarmMsgCode: textMsgCode,
	},
	HandleGetBlocksMsg_meterName: {
		AlarmReason:  "最近一分钟时间内收到其他节点请求拉取block消息次数大于60次",
		MetricsType:  TypeRate1,
		AlarmValue:   1,
		AlarmMsgCode: textMsgCode,
	},
	HandleBlockHashMsg_meterName: {
		AlarmReason:  "最近一分钟时间内普通节点收到广播的稳定块hash的次数大于6000次",
		MetricsType:  TypeRate1,
		AlarmValue:   100,
		AlarmMsgCode: textMsgCode,
	},
	HandleGetConfirmsMsg_meterName: {
		AlarmReason:  "最近一分钟时间内收到其他节点请求拉取block确认包消息次数大于960次",
		MetricsType:  TypeRate1,
		AlarmValue:   16,
		AlarmMsgCode: textMsgCode,
	},
	HandleConfirmMsg_meterName: {
		AlarmReason:  "最近一分钟时间内收到其他节点广播过来的区块确认包的次数大于960",
		MetricsType:  TypeRate1,
		AlarmValue:   16,
		AlarmMsgCode: textMsgCode,
	},
	HandleGetBlocksWithChangeLogMsg_meterName: {
		AlarmReason:  "最近一分钟时间内收到调用handleGetBlocksWithChangeLogMsg请求的次数大于600次",
		MetricsType:  TypeRate1,
		AlarmValue:   10,
		AlarmMsgCode: textMsgCode,
	},
	HandleDiscoverReqMsg_meterName: {
		AlarmReason:  "最近一分钟时间内收到调用handleDiscoverReqMsg的次数大于600次",
		MetricsType:  TypeRate1,
		AlarmValue:   10,
		AlarmMsgCode: textMsgCode,
	},
	HandleDiscoverResMsg_meterName: {
		// The reason must name handleDiscoverResMsg; it previously repeated
		// the handleDiscoverReqMsg text, making the two alarms indistinguishable.
		AlarmReason:  "最近一分钟时间内收到调用handleDiscoverResMsg的次数大于600次",
		MetricsType:  TypeRate1,
		AlarmValue:   10,
		AlarmMsgCode: textMsgCode,
	},
	PeerConnFailed_meterName: {
		AlarmReason:  "最近一分钟时间内节点连接断开的次数大于5次",
		MetricsType:  TypeRate1,
		AlarmValue:   0.083,
		AlarmMsgCode: textMsgCode,
	},
	ReadMsgSuccess_timerName: {
		AlarmReason:  "读取接收节点的Msg所用的平均时间大于6s,有必要升级网络带宽",
		MetricsType:  TypeMean,
		AlarmValue:   6,
		AlarmMsgCode: textMsgCode,
	},
	ReadMsgFailed_timerName: {
		AlarmReason:  "最近一分钟时间内读取接收节点的Msg失败的次数大于5次",
		MetricsType:  TypeRate1,
		AlarmValue:   0.083,
		AlarmMsgCode: textMsgCode,
	},
	WriteMsgSuccess_timerName: {
		AlarmReason:  "发送Msg给其他节点的平均用时超过5s,有必要升级网络带宽",
		MetricsType:  TypeMean,
		AlarmValue:   5,
		AlarmMsgCode: textMsgCode,
	},
	WriteMsgFailed_timerName: {
		AlarmReason:  "最近一分钟时间内发送Msg给其他节点失败的次数超过5次",
		MetricsType:  TypeRate1,
		AlarmValue:   0.083,
		AlarmMsgCode: textMsgCode,
	},
	VerifyFailedTx_meterName: {
		AlarmReason:  "最近一分钟时间内交易验证失败的的次数超过了30笔",
		MetricsType:  TypeRate1,
		AlarmValue:   0.5,
		AlarmMsgCode: textMsgCode,
	},
	BlockInsert_timerName: {
		AlarmReason:  "Insert chain 所用平均时间大于5s",
		MetricsType:  TypeMean,
		AlarmValue:   5,
		AlarmMsgCode: textMsgCode,
	},
	MineBlock_timerName: {
		AlarmReason:  "Mine Block 所用平均时间大于15s",
		MetricsType:  TypeMean,
		AlarmValue:   15,
		AlarmMsgCode: textMsgCode,
	},
	VerifyBlock_meterName: {
		AlarmReason:  "最近一分钟时间内收到2个以上InsertBlock校验不通过的block",
		MetricsType:  TypeRate1,
		AlarmValue:   0.033,
		AlarmMsgCode: textMsgCode,
	},
	UnStableBlock_meterName: {
		AlarmReason:  "未稳定块已经超过了设置的过度期区块总数的十分之九了",
		MetricsType:  TypeRate1,
		AlarmValue:   0.016,
		AlarmMsgCode: textMsgCode,
	},
	LevelDb_miss_meterName: {
		AlarmReason:  "最近一分钟时间内从leveldb中读取数据失败次数大于10次",
		MetricsType:  TypeRate1,
		AlarmValue:   0.16,
		AlarmMsgCode: textMsgCode,
	},
}
告警规则表
Functions ¶
func CollectProcessMetrics ¶
CollectProcessMetrics periodically collects various metrics about the running process.
func NewAlarmManager ¶
func NewAlarmManager() *alarmManager
func NewCounter ¶
func NewCounter(name string) metrics.Counter
NewCounter creates a new metrics Counter, either a real one or a NOP stub depending on the metrics flag.
func NewMeter ¶
func NewMeter(name string) metrics.Meter
NewMeter creates a new metrics Meter, either a real one or a NOP stub depending on the metrics flag.
func NewTimer ¶
func NewTimer(name string) metrics.Timer
NewTimer creates a new metrics Timer, either a real one or a NOP stub depending on the metrics flag.
func PointMetricsLog ¶
func PointMetricsLog()
func ReadDiskStats ¶
ReadDiskStats retrieves the disk IO stats belonging to the current process.
func SprintMetrics ¶
返回出给定name的metrics的[]string
func WriteMetricsData ¶
WriteMetricsData 收集统计数据
Types ¶
type Condition ¶
// Condition describes the alarm rule for one metric; AlarmRuleTable holds one
// Condition per metric name.
type Condition struct {
	AlarmReason  string      // reason reported with the alarm
	MetricsType  metricsType // kind of metric value that is checked for the alarm
	AlarmValue   float64     // critical metric value that triggers the alarm
	TimeStamp    time.Time   // records the time of the last alarm
	AlarmMsgCode uint32      // type of alarm message to send; only the text type is supported for now
}
验证触发告警条件
type DiskStats ¶
type DiskStats struct {
	ReadCount  int64 // Number of read operations executed
	ReadBytes  int64 // Total number of bytes read
	WriteCount int64 // Number of write operations executed
	WriteBytes int64 // Total number of bytes written
}
DiskStats is the per process disk io stats.