From 6b2b328cdb51a99e2ce55f0bc22eeb9733c9ec30 Mon Sep 17 00:00:00 2001 From: gary rong Date: Mon, 16 Apr 2018 16:18:24 +0800 Subject: [PATCH] ethdb: add leveldb write delay statistic (#16499) --- ethdb/database.go | 95 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/ethdb/database.go b/ethdb/database.go index 243b7f8d3c..001d8f0bb9 100644 --- a/ethdb/database.go +++ b/ethdb/database.go @@ -17,6 +17,7 @@ package ethdb import ( + "fmt" "strconv" "strings" "sync" @@ -32,17 +33,25 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) +const ( + writeDelayNThreshold = 200 + writeDelayThreshold = 350 * time.Millisecond + writeDelayWarningThrottler = 1 * time.Minute +) + var OpenFileLimit = 64 type LDBDatabase struct { fn string // filename for reporting db *leveldb.DB // LevelDB instance - compTimeMeter metrics.Meter // Meter for measuring the total time spent in database compaction - compReadMeter metrics.Meter // Meter for measuring the data read during compaction - compWriteMeter metrics.Meter // Meter for measuring the data written during compaction - diskReadMeter metrics.Meter // Meter for measuring the effective amount of data read - diskWriteMeter metrics.Meter // Meter for measuring the effective amount of data written + compTimeMeter metrics.Meter // Meter for measuring the total time spent in database compaction + compReadMeter metrics.Meter // Meter for measuring the data read during compaction + compWriteMeter metrics.Meter // Meter for measuring the data written during compaction + writeDelayNMeter metrics.Meter // Meter for measuring the write delay number due to database compaction + writeDelayMeter metrics.Meter // Meter for measuring the write delay duration due to database compaction + diskReadMeter metrics.Meter // Meter for measuring the effective amount of data read + diskWriteMeter metrics.Meter // Meter for measuring the effective amount of data written quitLock sync.Mutex // Mutex protecting the quit channel access quitChan chan chan error // Quit channel to stop the metrics collection before closing the database @@ -147,16 +156,17 @@ func (db *LDBDatabase) LDB() *leveldb.DB { // Meter configures the database metrics collectors and func (db *LDBDatabase) Meter(prefix string) { - // Short circuit metering if the metrics system is disabled - if !metrics.Enabled { - return + if metrics.Enabled { + // Initialize all the metrics collector at the requested prefix + db.compTimeMeter = metrics.NewRegisteredMeter(prefix+"compact/time", nil) + db.compReadMeter = metrics.NewRegisteredMeter(prefix+"compact/input", nil) + db.compWriteMeter = metrics.NewRegisteredMeter(prefix+"compact/output", nil) + db.diskReadMeter = metrics.NewRegisteredMeter(prefix+"disk/read", nil) + db.diskWriteMeter = metrics.NewRegisteredMeter(prefix+"disk/write", nil) } - // Initialize all the metrics collector at the requested prefix - db.compTimeMeter = metrics.NewRegisteredMeter(prefix+"compact/time", nil) - db.compReadMeter = metrics.NewRegisteredMeter(prefix+"compact/input", nil) - db.compWriteMeter = metrics.NewRegisteredMeter(prefix+"compact/output", nil) - db.diskReadMeter = metrics.NewRegisteredMeter(prefix+"disk/read", nil) - db.diskWriteMeter = metrics.NewRegisteredMeter(prefix+"disk/write", nil) + // Initialize write delay metrics no matter we are in metric mode or not. + db.writeDelayMeter = metrics.NewRegisteredMeter(prefix+"compact/writedelay/duration", nil) + db.writeDelayNMeter = metrics.NewRegisteredMeter(prefix+"compact/writedelay/counter", nil) // Create a quit channel for the periodic collector and run it db.quitLock.Lock() @@ -178,6 +188,9 @@ func (db *LDBDatabase) Meter(prefix string) { // 2 | 523 | 1000.37159 | 7.26059 | 66.86342 | 66.77884 // 3 | 570 | 1113.18458 | 0.00000 | 0.00000 | 0.00000 // +// This is how the write delay look like (currently): +// DelayN:5 Delay:406.604657ms +// // This is how the iostats look like (currently): // Read(MB):3895.04860 Write(MB):3654.64712 func (db *LDBDatabase) meter(refresh time.Duration) { @@ -188,6 +201,14 @@ func (db *LDBDatabase) meter(refresh time.Duration) { } // Create storage for iostats. var iostats [2]float64 + + // Create storage and warning log tracer for write delay. + var ( + delaystats [2]int64 + lastWriteDelay time.Time + lastWriteDelayN time.Time + ) + // Iterate ad infinitum and collect the stats for i := 1; ; i++ { // Retrieve the database stats @@ -236,6 +257,52 @@ func (db *LDBDatabase) meter(refresh time.Duration) { db.compWriteMeter.Mark(int64((compactions[i%2][2] - compactions[(i-1)%2][2]) * 1024 * 1024)) } + // Retrieve the write delay statistic + writedelay, err := db.db.GetProperty("leveldb.writedelay") + if err != nil { + db.log.Error("Failed to read database write delay statistic", "err", err) + return + } + var ( + delayN int64 + delayDuration string + duration time.Duration + ) + if n, err := fmt.Sscanf(writedelay, "DelayN:%d Delay:%s", &delayN, &delayDuration); n != 2 || err != nil { + db.log.Error("Write delay statistic not found") + return + } + duration, err = time.ParseDuration(delayDuration) + if err != nil { + db.log.Error("Failed to parse delay duration", "err", err) + return + } + if db.writeDelayNMeter != nil { + db.writeDelayNMeter.Mark(delayN - delaystats[0]) + // If the write delay number been collected in the last minute exceeds the predefined threshold, + // print a warning log here. + // If a warning that db performance is laggy has been displayed, + // any subsequent warnings will be withhold for 1 minute to don't overwhelm the user. + if int(db.writeDelayNMeter.Rate1()) > writeDelayNThreshold && + time.Now().After(lastWriteDelayN.Add(writeDelayWarningThrottler)) { + db.log.Warn("Write delay number exceeds the threshold (200 per second) in the last minute") + lastWriteDelayN = time.Now() + } + } + if db.writeDelayMeter != nil { + db.writeDelayMeter.Mark(duration.Nanoseconds() - delaystats[1]) + // If the write delay duration been collected in the last minute exceeds the predefined threshold, + // print a warning log here. + // If a warning that db performance is laggy has been displayed, + // any subsequent warnings will be withhold for 1 minute to don't overwhelm the user. + if int64(db.writeDelayMeter.Rate1()) > writeDelayThreshold.Nanoseconds() && + time.Now().After(lastWriteDelay.Add(writeDelayWarningThrottler)) { + db.log.Warn("Write delay duration exceeds the threshold (35% of the time) in the last minute") + lastWriteDelay = time.Now() + } + } + delaystats[0], delaystats[1] = delayN, duration.Nanoseconds() + // Retrieve the database iostats. ioStats, err := db.db.GetProperty("leveldb.iostats") if err != nil {