Compare commits

...

3 Commits

Author SHA1 Message Date
corpix 15db9f8c89
Merge dd13cb1d03 into 63dd116924 2025-07-03 11:19:38 +02:00
Nick Garlis 63dd116924
Automatically set socket read & write buffer sizes (#312)
This is an attempt to port the logic that nftables uses to automatically
adjust the recvmsg & sndmsg buffer sizes. The implementation of setting
sndmsg size is not the same as nftables due to some limitations in the
underlying netlink & socket libraries.

- https://git.netfilter.org/nftables/tree/src/mnl.c?id=713592c6008a8c589a00d3d3d2e49709ff2de62c#n262
- https://git.netfilter.org/libmnl/tree/include/libmnl/libmnl.h?id=03da98bcd284d55212bc79e91dfb63da0ef7b937#n20
- https://git.netfilter.org/nftables/tree/src/mnl.c?id=713592c6008a8c589a00d3d3d2e49709ff2de62c#n391
- https://git.netfilter.org/nftables/tree/src/mnl.c?id=713592c6008a8c589a00d3d3d2e49709ff2de62c#n426

We should not enlarge the socket buffers when:
 - We are using a test dial (there are no buffers to enlarge).
 - A connection has been initialized with socket options which means
   that the user could have specified fixed buffer sizes.
2025-07-02 09:53:17 +02:00
Dmitry Moskowski dd13cb1d03 Replace %v with %w to wrap underlying errors 2025-04-05 21:03:30 +00:00
11 changed files with 178 additions and 35 deletions

View File

@ -215,7 +215,7 @@ func (cc *Conn) ListChain(table *Table, chain string) (*Chain, error) {
response, err := conn.Execute(msg)
if err != nil {
return nil, fmt.Errorf("conn.Execute failed: %v", err)
return nil, fmt.Errorf("conn.Execute failed: %w", err)
}
if got, want := len(response), 1; got != want {

112
conn.go
View File

@ -116,7 +116,9 @@ func WithTestDial(f nltest.Func) ConnOption {
}
// WithSockOptions sets the specified socket options when creating a new netlink
// connection.
// connection. Note that when using WithSockOptions, you are responsible for
// providing a large-enough read and write buffer, whereas normally, the
// nftables package automatically enlarges the buffers as needed.
func WithSockOptions(opts ...SockOption) ConnOption {
return func(cc *Conn) {
cc.sockOptions = append(cc.sockOptions, opts...)
@ -250,6 +252,13 @@ func (cc *Conn) Flush() error {
}
defer func() { _ = closer() }()
if err := cc.enlargeWriteBuffer(conn); err != nil {
return err
}
if err := cc.enlargeReadBuffer(conn); err != nil {
return err
}
messages, err := conn.SendMessages(batch(cc.messages))
if err != nil {
return fmt.Errorf("SendMessages: %w", err)
@ -423,3 +432,104 @@ func (cc *Conn) allocateTransactionID() uint32 {
}
return cc.lastID
}
// getMessageSize returns the total size of all messages in the buffer.
func (cc *Conn) getMessageSize() int {
var total int
for _, msg := range cc.messages {
total += len(msg.Data) + unix.NLMSG_HDRLEN
}
return total
}
// canEnlargeBuffers returns true if the connection can automatically enlarge
// the write and read buffers of the netlink connection.
func (cc *Conn) canEnlargeBuffers() bool {
// If there are sock options, we assume that the user has already set the
// buffers to a fixed size.
if len(cc.sockOptions) > 0 {
return false
}
if cc.TestDial != nil {
return false
}
return true
}
// enlargeWriteBuffer automatically sets the write buffer of the given
// connection to the accumulated message size. This is only done if the current
// write buffer is smaller than the message size.
//
// nftables actually handles this differently, it multiplies the number of
// iovec entries by 2MB. This is not possible to do here as our underlying
// netlink and socket libraries will only add a single iovec entry and
// won't expose the number of entries.
// https://git.netfilter.org/nftables/tree/src/mnl.c?id=713592c6008a8c589a00d3d3d2e49709ff2de62c#n262
//
// TODO: Update this function to mimic the behavior of nftables once our
// socket library supports multiple iovec entries.
func (cc *Conn) enlargeWriteBuffer(conn *netlink.Conn) error {
if !cc.canEnlargeBuffers() {
return nil
}
messageSize := cc.getMessageSize()
writeBuffer, err := conn.WriteBuffer()
if err != nil {
return err
}
if writeBuffer < messageSize {
return conn.SetWriteBuffer(messageSize)
}
return nil
}
// getDefaultEchoReadBuffer returns the minimum read buffer size for batches
// with echo messages.
//
// See https://git.netfilter.org/libmnl/tree/include/libmnl/libmnl.h?id=03da98bcd284d55212bc79e91dfb63da0ef7b937#n20
// and https://git.netfilter.org/nftables/tree/src/mnl.c?id=713592c6008a8c589a00d3d3d2e49709ff2de62c#n391
func (cc *Conn) getDefaultEchoReadBuffer() int {
pageSize := os.Getpagesize()
return max(pageSize, 8192) * 1024
}
// enlargeReadBuffer automatically sets the read buffer of the given connection
// to the required size. This is only done if the current read buffer is smaller
// than the required size.
//
// See https://git.netfilter.org/nftables/tree/src/mnl.c?id=713592c6008a8c589a00d3d3d2e49709ff2de62c#n426
func (cc *Conn) enlargeReadBuffer(conn *netlink.Conn) error {
if !cc.canEnlargeBuffers() {
return nil
}
var bufferSize int
// If there are any messages with the Echo flag, we initialize the buffer size
// to the default echo read buffer size.
for _, msg := range cc.messages {
if msg.Header.Flags&netlink.Echo == 0 {
bufferSize = cc.getDefaultEchoReadBuffer()
break
}
}
// Just like nftables, we allocate 1024 bytes for each message in the batch.
requiredSize := len(cc.messages) * 1024
if bufferSize < requiredSize {
bufferSize = requiredSize
}
currSize, err := conn.ReadBuffer()
if err != nil {
return err
}
if currSize < bufferSize {
return conn.SetReadBuffer(bufferSize)
}
return nil
}

View File

@ -66,7 +66,7 @@ func (e *Immediate) unmarshal(fam byte, data []byte) error {
case unix.NFTA_IMMEDIATE_DATA:
nestedAD, err := netlink.NewAttributeDecoder(ad.Bytes())
if err != nil {
return fmt.Errorf("nested NewAttributeDecoder() failed: %v", err)
return fmt.Errorf("nested NewAttributeDecoder() failed: %w", err)
}
for nestedAD.Next() {
switch nestedAD.Type() {
@ -75,7 +75,7 @@ func (e *Immediate) unmarshal(fam byte, data []byte) error {
}
}
if nestedAD.Err() != nil {
return fmt.Errorf("decoding immediate: %v", nestedAD.Err())
return fmt.Errorf("decoding immediate: %w", nestedAD.Err())
}
}
}

View File

@ -111,7 +111,7 @@ func (e *Verdict) unmarshal(fam byte, data []byte) error {
case unix.NFTA_IMMEDIATE_DATA:
nestedAD, err := netlink.NewAttributeDecoder(ad.Bytes())
if err != nil {
return fmt.Errorf("nested NewAttributeDecoder() failed: %v", err)
return fmt.Errorf("nested NewAttributeDecoder() failed: %w", err)
}
for nestedAD.Next() {
switch nestedAD.Type() {
@ -123,7 +123,7 @@ func (e *Verdict) unmarshal(fam byte, data []byte) error {
}
}
if nestedAD.Err() != nil {
return fmt.Errorf("decoding immediate: %v", nestedAD.Err())
return fmt.Errorf("decoding immediate: %w", nestedAD.Err())
}
}
}

View File

@ -214,12 +214,12 @@ func (cc *Conn) getFlowtables(t *Table) ([]netlink.Message, error) {
}
if _, err := conn.SendMessages([]netlink.Message{message}); err != nil {
return nil, fmt.Errorf("SendMessages: %v", err)
return nil, fmt.Errorf("SendMessages: %w", err)
}
reply, err := receiveAckAware(conn, message.Header.Flags)
if err != nil {
return nil, fmt.Errorf("receiveAckAware: %v", err)
return nil, fmt.Errorf("receiveAckAware: %w", err)
}
return reply, nil

4
go.mod
View File

@ -4,14 +4,14 @@ go 1.23.0
require (
github.com/google/go-cmp v0.6.0
github.com/mdlayher/netlink v1.7.3-0.20250113171957-fbb4dce95f42
github.com/mdlayher/netlink v1.7.3-0.20250702063131-0f7746f74615
github.com/vishvananda/netlink v1.3.0
github.com/vishvananda/netns v0.0.4
golang.org/x/sys v0.31.0
)
require (
github.com/mdlayher/socket v0.5.0 // indirect
github.com/mdlayher/socket v0.5.1 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/sync v0.6.0 // indirect
)

8
go.sum
View File

@ -1,9 +1,9 @@
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/mdlayher/netlink v1.7.3-0.20250113171957-fbb4dce95f42 h1:A1Cq6Ysb0GM0tpKMbdCXCIfBclan4oHk1Jb+Hrejirg=
github.com/mdlayher/netlink v1.7.3-0.20250113171957-fbb4dce95f42/go.mod h1:BB4YCPDOzfy7FniQ/lxuYQ3dgmM2cZumHbK8RpTjN2o=
github.com/mdlayher/socket v0.5.0 h1:ilICZmJcQz70vrWVes1MFera4jGiWNocSkykwwoy3XI=
github.com/mdlayher/socket v0.5.0/go.mod h1:WkcBFfvyG8QENs5+hfQPl1X6Jpd2yeLIYgrGFmJiJxI=
github.com/mdlayher/netlink v1.7.3-0.20250702063131-0f7746f74615 h1:5T2ai+PpYFKe+tyNj/ZxePZGiYoG5xDOylT30nywJUU=
github.com/mdlayher/netlink v1.7.3-0.20250702063131-0f7746f74615/go.mod h1:ZlWrPUV9wyD64k5skWrIv4WDQmmiUbkNnCkBtEWYKwU=
github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ=
github.com/vishvananda/netlink v1.3.0 h1:X7l42GfcV4S6E4vHTsw48qbrV+9PVojNfIhZcwQdrZk=
github.com/vishvananda/netlink v1.3.0/go.mod h1:i6NetklAujEcC6fK0JPjT8qSwWyO0HLn4UKG+hGqeJs=
github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8=

View File

@ -7396,3 +7396,36 @@ func TestSetElementComment(t *testing.T) {
}
}
}
func TestAutoBufferSize(t *testing.T) {
conn, newNS := nftest.OpenSystemConn(t, *enableSysTests)
defer nftest.CleanupSystemConn(t, newNS)
conn.FlushRuleset()
defer conn.FlushRuleset()
table := conn.AddTable(&nftables.Table{
Family: nftables.TableFamilyIPv4,
Name: "test-table",
})
chain := conn.AddChain(&nftables.Chain{
Name: "test-chain",
Table: table,
})
for range 4096 {
conn.AddRule(&nftables.Rule{
Table: table,
Chain: chain,
Exprs: []expr.Any{
&expr.Verdict{
Kind: expr.VerdictAccept,
},
},
})
}
if err := conn.Flush(); err != nil {
t.Fatalf("failed to flush: %v", err)
}
}

4
obj.go
View File

@ -361,12 +361,12 @@ func (cc *Conn) getObjWithLegacyType(o Obj, t *Table, msgType uint16, returnLega
}
if _, err := conn.SendMessages([]netlink.Message{message}); err != nil {
return nil, fmt.Errorf("SendMessages: %v", err)
return nil, fmt.Errorf("SendMessages: %w", err)
}
reply, err := receiveAckAware(conn, message.Header.Flags)
if err != nil {
return nil, fmt.Errorf("receiveAckAware: %v", err)
return nil, fmt.Errorf("receiveAckAware: %w", err)
}
var objs []Obj
for _, msg := range reply {

View File

@ -101,12 +101,12 @@ func (cc *Conn) GetRules(t *Table, c *Chain) ([]*Rule, error) {
}
if _, err := conn.SendMessages([]netlink.Message{message}); err != nil {
return nil, fmt.Errorf("SendMessages: %v", err)
return nil, fmt.Errorf("SendMessages: %w", err)
}
reply, err := receiveAckAware(conn, message.Header.Flags)
if err != nil {
return nil, fmt.Errorf("receiveAckAware: %v", err)
return nil, fmt.Errorf("receiveAckAware: %w", err)
}
var rules []*Rule
for _, msg := range reply {

34
set.go
View File

@ -298,7 +298,7 @@ func (s *SetElement) decode(fam byte) func(b []byte) error {
return func(b []byte) error {
ad, err := netlink.NewAttributeDecoder(b)
if err != nil {
return fmt.Errorf("failed to create nested attribute decoder: %v", err)
return fmt.Errorf("failed to create nested attribute decoder: %w", err)
}
ad.ByteOrder = binary.BigEndian
@ -353,7 +353,7 @@ func (s *SetElement) decode(fam byte) func(b []byte) error {
func decodeElement(d []byte) ([]byte, error) {
ad, err := netlink.NewAttributeDecoder(d)
if err != nil {
return nil, fmt.Errorf("failed to create nested attribute decoder: %v", err)
return nil, fmt.Errorf("failed to create nested attribute decoder: %w", err)
}
ad.ByteOrder = binary.BigEndian
var b []byte
@ -414,14 +414,14 @@ func (cc *Conn) appendElemList(s *Set, vals []SetElement, hdrType uint16) error
encodedKey, err := netlink.MarshalAttributes([]netlink.Attribute{{Type: unix.NFTA_DATA_VALUE, Data: v.Key}})
if err != nil {
return fmt.Errorf("marshal key %d: %v", i, err)
return fmt.Errorf("marshal key %d: %w", i, err)
}
item = append(item, netlink.Attribute{Type: unix.NFTA_SET_ELEM_KEY | unix.NLA_F_NESTED, Data: encodedKey})
if len(v.KeyEnd) > 0 {
encodedKeyEnd, err := netlink.MarshalAttributes([]netlink.Attribute{{Type: unix.NFTA_DATA_VALUE, Data: v.KeyEnd}})
if err != nil {
return fmt.Errorf("marshal key end %d: %v", i, err)
return fmt.Errorf("marshal key end %d: %w", i, err)
}
item = append(item, netlink.Attribute{Type: NFTA_SET_ELEM_KEY_END | unix.NLA_F_NESTED, Data: encodedKeyEnd})
}
@ -441,7 +441,7 @@ func (cc *Conn) appendElemList(s *Set, vals []SetElement, hdrType uint16) error
{Type: unix.NFTA_DATA_VALUE, Data: binaryutil.BigEndian.PutUint32(uint32(v.VerdictData.Kind))},
})
if err != nil {
return fmt.Errorf("marshal item %d: %v", i, err)
return fmt.Errorf("marshal item %d: %w", i, err)
}
encodedVal = append(encodedVal, encodedKind...)
if len(v.VerdictData.Chain) != 0 {
@ -449,21 +449,21 @@ func (cc *Conn) appendElemList(s *Set, vals []SetElement, hdrType uint16) error
{Type: unix.NFTA_SET_ELEM_DATA, Data: []byte(v.VerdictData.Chain + "\x00")},
})
if err != nil {
return fmt.Errorf("marshal item %d: %v", i, err)
return fmt.Errorf("marshal item %d: %w", i, err)
}
encodedVal = append(encodedVal, encodedChain...)
}
encodedVerdict, err := netlink.MarshalAttributes([]netlink.Attribute{
{Type: unix.NFTA_SET_ELEM_DATA | unix.NLA_F_NESTED, Data: encodedVal}})
if err != nil {
return fmt.Errorf("marshal item %d: %v", i, err)
return fmt.Errorf("marshal item %d: %w", i, err)
}
item = append(item, netlink.Attribute{Type: unix.NFTA_SET_ELEM_DATA | unix.NLA_F_NESTED, Data: encodedVerdict})
case len(v.Val) > 0:
// Since v.Val's length is not 0 then, v is a regular map element, need to add to the attributes
encodedVal, err := netlink.MarshalAttributes([]netlink.Attribute{{Type: unix.NFTA_DATA_VALUE, Data: v.Val}})
if err != nil {
return fmt.Errorf("marshal item %d: %v", i, err)
return fmt.Errorf("marshal item %d: %w", i, err)
}
item = append(item, netlink.Attribute{Type: unix.NFTA_SET_ELEM_DATA | unix.NLA_F_NESTED, Data: encodedVal})
@ -479,7 +479,7 @@ func (cc *Conn) appendElemList(s *Set, vals []SetElement, hdrType uint16) error
encodedItem, err := netlink.MarshalAttributes(item)
if err != nil {
return fmt.Errorf("marshal item %d: %v", i, err)
return fmt.Errorf("marshal item %d: %w", i, err)
}
itemSize := unix.NLA_HDRLEN + len(encodedItem)
@ -496,7 +496,7 @@ func (cc *Conn) appendElemList(s *Set, vals []SetElement, hdrType uint16) error
for _, batch := range batches {
encodedElem, err := netlink.MarshalAttributes(batch)
if err != nil {
return fmt.Errorf("marshal elements: %v", err)
return fmt.Errorf("marshal elements: %w", err)
}
message := []netlink.Attribute{
@ -591,7 +591,7 @@ func (cc *Conn) AddSet(s *Set, vals []SetElement) error {
{Type: unix.NFTA_DATA_VALUE, Data: binaryutil.BigEndian.PutUint32(uint32(len(vals)))},
})
if err != nil {
return fmt.Errorf("fail to marshal number of elements %d: %v", len(vals), err)
return fmt.Errorf("fail to marshal number of elements %d: %w", len(vals), err)
}
tableInfo = append(tableInfo, netlink.Attribute{Type: unix.NLA_F_NESTED | unix.NFTA_SET_DESC, Data: numberOfElements})
}
@ -620,7 +620,7 @@ func (cc *Conn) AddSet(s *Set, vals []SetElement) error {
{Type: unix.NFTA_DATA_VALUE, Data: binaryutil.BigEndian.PutUint32(v.Bytes)},
})
if err != nil {
return fmt.Errorf("fail to marshal element key size %d: %v", i, err)
return fmt.Errorf("fail to marshal element key size %d: %w", i, err)
}
// Marshal base type size description
descSize, err := netlink.MarshalAttributes([]netlink.Attribute{
@ -634,7 +634,7 @@ func (cc *Conn) AddSet(s *Set, vals []SetElement) error {
// Marshal all base type descriptions into concatenation size description
concatBytes, err := netlink.MarshalAttributes([]netlink.Attribute{{Type: unix.NLA_F_NESTED | NFTA_SET_DESC_CONCAT, Data: concatDefinition}})
if err != nil {
return fmt.Errorf("fail to marshal concat definition %v", err)
return fmt.Errorf("fail to marshal concat definition %w", err)
}
descBytes = append(descBytes, concatBytes...)
@ -890,12 +890,12 @@ func (cc *Conn) GetSets(t *Table) ([]*Set, error) {
}
if _, err := conn.SendMessages([]netlink.Message{message}); err != nil {
return nil, fmt.Errorf("SendMessages: %v", err)
return nil, fmt.Errorf("SendMessages: %w", err)
}
reply, err := receiveAckAware(conn, message.Header.Flags)
if err != nil {
return nil, fmt.Errorf("receiveAckAware: %v", err)
return nil, fmt.Errorf("receiveAckAware: %w", err)
}
var sets []*Set
for _, msg := range reply {
@ -980,12 +980,12 @@ func (cc *Conn) GetSetElements(s *Set) ([]SetElement, error) {
}
if _, err := conn.SendMessages([]netlink.Message{message}); err != nil {
return nil, fmt.Errorf("SendMessages: %v", err)
return nil, fmt.Errorf("SendMessages: %w", err)
}
reply, err := receiveAckAware(conn, message.Header.Flags)
if err != nil {
return nil, fmt.Errorf("receiveAckAware: %v", err)
return nil, fmt.Errorf("receiveAckAware: %w", err)
}
var elems []SetElement
for _, msg := range reply {