@@ -19,6 +19,7 @@ import (
1919 "fmt"
2020 "path/filepath"
2121 "strconv"
22+ "sync"
2223
2324 "github.com/go-kit/kit/log"
2425 "github.com/go-kit/kit/log/level"
@@ -35,6 +36,8 @@ type cpuCollector struct {
3536 cpuCoreThrottle * prometheus.Desc
3637 cpuPackageThrottle * prometheus.Desc
3738 logger log.Logger
39+ cpuStats []procfs.CPUStat
40+ cpuStatsMutex sync.Mutex
3841}
3942
4043var (
@@ -203,7 +206,12 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
203206 return err
204207 }
205208
206- for cpuID , cpuStat := range stats .CPU {
209+ c .updateCPUStats (stats .CPU )
210+
211+ // Acquire a lock to read the stats.
212+ c .cpuStatsMutex .Lock ()
213+ defer c .cpuStatsMutex .Unlock ()
214+ for cpuID , cpuStat := range c .cpuStats {
207215 cpuNum := strconv .Itoa (cpuID )
208216 ch <- prometheus .MustNewConstMetric (c .cpu , prometheus .CounterValue , cpuStat .User , cpuNum , "user" )
209217 ch <- prometheus .MustNewConstMetric (c .cpu , prometheus .CounterValue , cpuStat .Nice , cpuNum , "nice" )
@@ -221,3 +229,78 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
221229
222230 return nil
223231}
232+
233+ // updateCPUStats updates the internal cache of CPU stats.
234+ func (c * cpuCollector ) updateCPUStats (newStats []procfs.CPUStat ) {
235+ // Acquire a lock to update the stats.
236+ c .cpuStatsMutex .Lock ()
237+ defer c .cpuStatsMutex .Unlock ()
238+
239+ // Reset the cache if the list of CPUs has changed.
240+ if len (c .cpuStats ) != len (newStats ) {
241+ c .cpuStats = make ([]procfs.CPUStat , len (newStats ))
242+ }
243+
244+ for i , n := range newStats {
245+ // If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
246+ if n .Idle < c .cpuStats [i ].Idle {
247+ level .Warn (c .logger ).Log ("msg" , "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats" , "cpu" , i , "old_value" , c .cpuStats [i ].Idle , "new_value" , n .Idle )
248+ c .cpuStats [i ] = procfs.CPUStat {}
249+ }
250+ c .cpuStats [i ].Idle = n .Idle
251+
252+ if n .User >= c .cpuStats [i ].User {
253+ c .cpuStats [i ].User = n .User
254+ } else {
255+ level .Warn (c .logger ).Log ("msg" , "CPU User counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].User , "new_value" , n .User )
256+ }
257+
258+ if n .Nice >= c .cpuStats [i ].Nice {
259+ c .cpuStats [i ].Nice = n .Nice
260+ } else {
261+ level .Warn (c .logger ).Log ("msg" , "CPU Nice counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].Nice , "new_value" , n .Nice )
262+ }
263+
264+ if n .System >= c .cpuStats [i ].System {
265+ c .cpuStats [i ].System = n .System
266+ } else {
267+ level .Warn (c .logger ).Log ("msg" , "CPU System counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].System , "new_value" , n .System )
268+ }
269+
270+ if n .Iowait >= c .cpuStats [i ].Iowait {
271+ c .cpuStats [i ].Iowait = n .Iowait
272+ } else {
273+ level .Warn (c .logger ).Log ("msg" , "CPU Iowait counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].Iowait , "new_value" , n .Iowait )
274+ }
275+
276+ if n .IRQ >= c .cpuStats [i ].IRQ {
277+ c .cpuStats [i ].IRQ = n .IRQ
278+ } else {
279+ level .Warn (c .logger ).Log ("msg" , "CPU IRQ counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].IRQ , "new_value" , n .IRQ )
280+ }
281+
282+ if n .SoftIRQ >= c .cpuStats [i ].SoftIRQ {
283+ c .cpuStats [i ].SoftIRQ = n .SoftIRQ
284+ } else {
285+ level .Warn (c .logger ).Log ("msg" , "CPU SoftIRQ counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].SoftIRQ , "new_value" , n .SoftIRQ )
286+ }
287+
288+ if n .Steal >= c .cpuStats [i ].Steal {
289+ c .cpuStats [i ].Steal = n .Steal
290+ } else {
291+ level .Warn (c .logger ).Log ("msg" , "CPU Steal counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].Steal , "new_value" , n .Steal )
292+ }
293+
294+ if n .Guest >= c .cpuStats [i ].Guest {
295+ c .cpuStats [i ].Guest = n .Guest
296+ } else {
297+ level .Warn (c .logger ).Log ("msg" , "CPU Guest counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].Guest , "new_value" , n .Guest )
298+ }
299+
300+ if n .GuestNice >= c .cpuStats [i ].GuestNice {
301+ c .cpuStats [i ].GuestNice = n .GuestNice
302+ } else {
303+ level .Warn (c .logger ).Log ("msg" , "CPU GuestNice counter jumped backwards" , "cpu" , i , "old_value" , c .cpuStats [i ].GuestNice , "new_value" , n .GuestNice )
304+ }
305+ }
306+ }
0 commit comments