Skip to content

Commit da5972b

Browse files
Add gauges for allocated memory for queued UDP and TCP packages (prometheus#1503)
* Two new states will be added to the tcpstat collector called rx_queued_bytes and tx_queued_bytes. For UDP datagrams an additional collector 'udp_queues' can be used to expose the total lengths of the tx_queue and rx_queue. @SuperQ and @discordianfish this changes gives us the option to check for overloaded UDP + TCP processing. The names of the new TCP states and the UDP metric can be discussed. The current reasons are just: I don't want to add another collector for the same exposed file, so I just added the new states to the tcpstat collector. I chose the name 'udp_queue' instead of 'udpstat' as UDP has no state. Signed-off-by: Peter Bueschel <peter.bueschel@logmein.com>
1 parent 4891b01 commit da5972b

9 files changed

Lines changed: 189 additions & 4 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
* [FEATURE] Add Btrfs collector #1512
4242
* [FEATURE] Add RAPL collector #1523
4343
* [FEATURE] Add new softnet collector #1576
44+
* [FEATURE] Add new udp_queues collector #1503
4445
* [ENHANCEMENT] Log pid when there is a problem reading the process stats #1341
4546
* [ENHANCEMENT] Collect InfiniBand port state and physical state #1357
4647
* [ENHANCEMENT] Include additional XFS runtime statistics. #1423

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ textfile | Exposes statistics read from local disk. The `--collector.textfile.di
6060
thermal\_zone | Exposes thermal zone & cooling device statistics from `/sys/class/thermal`. | Linux
6161
time | Exposes the current system time. | _any_
6262
timex | Exposes selected adjtimex(2) system call stats. | Linux
63+
udp_queues | Exposes UDP total lengths of the rx_queue and tx_queue from `/proc/net/udp` and `/proc/net/udp6`. | Linux
6364
uname | Exposes system information as provided by the uname system call. | Darwin, FreeBSD, Linux, OpenBSD
6465
vmstat | Exposes statistics from `/proc/vmstat`. | Linux
6566
xfs | Exposes XFS runtime statistics. | Linux (kernel 4.4+)

collector/fixtures/e2e-output.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2644,6 +2644,7 @@ node_scrape_collector_success{collector="softnet"} 1
26442644
node_scrape_collector_success{collector="stat"} 1
26452645
node_scrape_collector_success{collector="textfile"} 1
26462646
node_scrape_collector_success{collector="thermal_zone"} 1
2647+
node_scrape_collector_success{collector="udp_queues"} 1
26472648
node_scrape_collector_success{collector="vmstat"} 1
26482649
node_scrape_collector_success{collector="wifi"} 1
26492650
node_scrape_collector_success{collector="xfs"} 1
@@ -2734,6 +2735,10 @@ node_textfile_scrape_error 0
27342735
# HELP node_thermal_zone_temp Zone temperature in Celsius
27352736
# TYPE node_thermal_zone_temp gauge
27362737
node_thermal_zone_temp{type="cpu-thermal",zone="0"} 12.376
2738+
# HELP node_udp_queues Number of allocated memory in the kernel for UDP datagrams in bytes.
2739+
# TYPE node_udp_queues gauge
2740+
node_udp_queues{ip="v4",queue="rx"} 0
2741+
node_udp_queues{ip="v4",queue="tx"} 21
27372742
# HELP node_vmstat_oom_kill /proc/vmstat information field oom_kill.
27382743
# TYPE node_vmstat_oom_kill untyped
27392744
node_vmstat_oom_kill 0
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
2-
0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0
3-
1: 0F02000A:0016 0202000A:8B6B 01 00000000:00000000 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46
2+
0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0
3+
1: 0F02000A:0016 0202000A:8B6B 01 00000015:00000001 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46

collector/fixtures/proc/net/udp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
2+
0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0

collector/tcpstat_linux.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ const (
5252
tcpListen
5353
// TCP_CLOSING
5454
tcpClosing
55+
// TCP_RX_BUFFER
56+
tcpRxQueuedBytes
57+
// TCP_TX_BUFFER
58+
tcpTxQueuedBytes
5559
)
5660

5761
type tcpStatCollector struct {
@@ -122,16 +126,34 @@ func parseTCPStats(r io.Reader) (map[tcpConnectionState]float64, error) {
122126
if len(parts) == 0 {
123127
continue
124128
}
125-
if len(parts) < 4 {
129+
if len(parts) < 5 {
126130
return nil, fmt.Errorf("invalid TCP stats line: %q", line)
127131
}
128132

133+
qu := strings.Split(parts[4], ":")
134+
if len(qu) < 2 {
135+
return nil, fmt.Errorf("cannot parse tx_queues and rx_queues: %q", line)
136+
}
137+
138+
tx, err := strconv.ParseUint(qu[0], 16, 64)
139+
if err != nil {
140+
return nil, err
141+
}
142+
tcpStats[tcpConnectionState(tcpTxQueuedBytes)] += float64(tx)
143+
144+
rx, err := strconv.ParseUint(qu[1], 16, 64)
145+
if err != nil {
146+
return nil, err
147+
}
148+
tcpStats[tcpConnectionState(tcpRxQueuedBytes)] += float64(rx)
149+
129150
st, err := strconv.ParseInt(parts[3], 16, 8)
130151
if err != nil {
131152
return nil, err
132153
}
133154

134155
tcpStats[tcpConnectionState(st)]++
156+
135157
}
136158

137159
return tcpStats, nil
@@ -161,6 +183,10 @@ func (st tcpConnectionState) String() string {
161183
return "listen"
162184
case tcpClosing:
163185
return "closing"
186+
case tcpRxQueuedBytes:
187+
return "rx_queued_bytes"
188+
case tcpTxQueuedBytes:
189+
return "tx_queued_bytes"
164190
default:
165191
return "unknown"
166192
}

collector/tcpstat_linux_test.go

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,27 @@ func Test_parseTCPStatsError(t *testing.T) {
2828
name: "too few fields",
2929
in: "sl local_address\n 0: 00000000:0016",
3030
},
31+
{
32+
name: "missing colon in tx-rx field",
33+
in: "sl local_address rem_address st tx_queue rx_queue\n" +
34+
" 1: 0F02000A:0016 0202000A:8B6B 01 0000000000000001",
35+
},
36+
{
37+
name: "tx parsing issue",
38+
in: "sl local_address rem_address st tx_queue rx_queue\n" +
39+
" 1: 0F02000A:0016 0202000A:8B6B 01 0000000x:00000001",
40+
},
41+
{
42+
name: "rx parsing issue",
43+
in: "sl local_address rem_address st tx_queue rx_queue\n" +
44+
" 1: 0F02000A:0016 0202000A:8B6B 01 00000000:0000000x",
45+
},
46+
{
47+
name: "state parsing issue",
48+
in: "sl local_address rem_address st tx_queue rx_queue\n" +
49+
" 1: 0F02000A:0016 0202000A:8B6B 0H 00000000:00000001",
50+
},
3151
}
32-
3352
for _, tt := range tests {
3453
t.Run(tt.name, func(t *testing.T) {
3554
if _, err := parseTCPStats(strings.NewReader(tt.in)); err == nil {
@@ -40,6 +59,14 @@ func Test_parseTCPStatsError(t *testing.T) {
4059
}
4160

4261
func TestTCPStat(t *testing.T) {
62+
63+
noFile, _ := os.Open("follow the white rabbit")
64+
defer noFile.Close()
65+
66+
if _, err := parseTCPStats(noFile); err == nil {
67+
t.Fatal("expected an error, but none occurred")
68+
}
69+
4370
file, err := os.Open("fixtures/proc/net/tcpstat")
4471
if err != nil {
4572
t.Fatal(err)
@@ -58,4 +85,39 @@ func TestTCPStat(t *testing.T) {
5885
if want, got := 1, int(tcpStats[tcpListen]); want != got {
5986
t.Errorf("want tcpstat number of listen state %d, got %d", want, got)
6087
}
88+
89+
if want, got := 42, int(tcpStats[tcpTxQueuedBytes]); want != got {
90+
t.Errorf("want tcpstat number of bytes in tx queue %d, got %d", want, got)
91+
}
92+
if want, got := 1, int(tcpStats[tcpRxQueuedBytes]); want != got {
93+
t.Errorf("want tcpstat number of bytes in rx queue %d, got %d", want, got)
94+
}
95+
96+
}
97+
98+
func Test_getTCPStats(t *testing.T) {
99+
type args struct {
100+
statsFile string
101+
}
102+
tests := []struct {
103+
name string
104+
args args
105+
wantErr bool
106+
}{
107+
{
108+
name: "file not found",
109+
args: args{statsFile: "somewhere over the rainbow"},
110+
wantErr: true,
111+
},
112+
}
113+
for _, tt := range tests {
114+
t.Run(tt.name, func(t *testing.T) {
115+
_, err := getTCPStats(tt.args.statsFile)
116+
if (err != nil) != tt.wantErr {
117+
t.Errorf("getTCPStats() error = %v, wantErr %v", err, tt.wantErr)
118+
return
119+
}
120+
// other cases are covered by TestTCPStat()
121+
})
122+
}
61123
}

collector/udp_queues_linux.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Copyright 2015 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
// +build !noudp_queues
15+
16+
package collector
17+
18+
import (
19+
"fmt"
20+
"os"
21+
22+
"github.com/go-kit/kit/log"
23+
"github.com/go-kit/kit/log/level"
24+
"github.com/prometheus/client_golang/prometheus"
25+
"github.com/prometheus/procfs"
26+
)
27+
28+
type (
29+
udpQueuesCollector struct {
30+
fs procfs.FS
31+
desc *prometheus.Desc
32+
logger log.Logger
33+
}
34+
)
35+
36+
func init() {
37+
registerCollector("udp_queues", defaultEnabled, NewUDPqueuesCollector)
38+
}
39+
40+
// NewUDPqueuesCollector returns a new Collector exposing network udp queued bytes.
41+
func NewUDPqueuesCollector(logger log.Logger) (Collector, error) {
42+
fs, err := procfs.NewFS(*procPath)
43+
if err != nil {
44+
return nil, fmt.Errorf("failed to open procfs: %v", err)
45+
}
46+
return &udpQueuesCollector{
47+
fs: fs,
48+
desc: prometheus.NewDesc(
49+
prometheus.BuildFQName(namespace, "udp", "queues"),
50+
"Number of allocated memory in the kernel for UDP datagrams in bytes.",
51+
[]string{"queue", "ip"}, nil,
52+
),
53+
logger: logger,
54+
}, nil
55+
}
56+
57+
func (c *udpQueuesCollector) Update(ch chan<- prometheus.Metric) error {
58+
59+
s4, errIPv4 := c.fs.NetUDPSummary()
60+
if errIPv4 == nil {
61+
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s4.TxQueueLength), "tx", "v4")
62+
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s4.RxQueueLength), "rx", "v4")
63+
} else {
64+
if os.IsNotExist(errIPv4) {
65+
level.Debug(c.logger).Log("msg", "not collecting ipv4 based metrics")
66+
} else {
67+
return fmt.Errorf("couldn't get upd queued bytes: %s", errIPv4)
68+
}
69+
}
70+
71+
s6, errIPv6 := c.fs.NetUDP6Summary()
72+
if errIPv6 == nil {
73+
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s6.TxQueueLength), "tx", "v6")
74+
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s6.RxQueueLength), "rx", "v6")
75+
} else {
76+
if os.IsNotExist(errIPv6) {
77+
level.Debug(c.logger).Log("msg", "not collecting ipv6 based metrics")
78+
} else {
79+
return fmt.Errorf("couldn't get upd6 queued bytes: %s", errIPv6)
80+
}
81+
}
82+
83+
if os.IsNotExist(errIPv4) && os.IsNotExist(errIPv6) {
84+
return ErrNoData
85+
}
86+
return nil
87+
}

end-to-end-test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ enabled_collectors=$(cat << COLLECTORS
3838
thermal_zone
3939
textfile
4040
bonding
41+
udp_queues
4142
vmstat
4243
wifi
4344
xfs

0 commit comments

Comments
 (0)