p2p: fix Peer shutdown deadlocks
There were multiple synchronization issues in the disconnect handling, all caused by the odd special-casing of Peer.readLoop errors. Remove the special handling of read errors and make readLoop part of the Peer WaitGroup. Thanks to @Gustav-Simonsson for pointing at arrows in a diagram and playing rubber-duck.
This commit is contained in:
parent
22d1f0faf1
commit
f1d710af00
|
@ -409,7 +409,7 @@ func readProtocolHandshake(r MsgReader, our *protoHandshake) (*protoHandshake, e
|
||||||
// spec and we send it ourself if Server.addPeer fails.
|
// spec and we send it ourself if Server.addPeer fails.
|
||||||
var reason DiscReason
|
var reason DiscReason
|
||||||
rlp.Decode(msg.Payload, &reason)
|
rlp.Decode(msg.Payload, &reason)
|
||||||
return nil, discRequestedError(reason)
|
return nil, reason
|
||||||
}
|
}
|
||||||
if msg.Code != handshakeMsg {
|
if msg.Code != handshakeMsg {
|
||||||
return nil, fmt.Errorf("expected handshake, got %x", msg.Code)
|
return nil, fmt.Errorf("expected handshake, got %x", msg.Code)
|
||||||
|
|
131
p2p/peer.go
131
p2p/peer.go
|
@ -44,7 +44,7 @@ type Peer struct {
|
||||||
rw *conn
|
rw *conn
|
||||||
running map[string]*protoRW
|
running map[string]*protoRW
|
||||||
|
|
||||||
protoWG sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
protoErr chan error
|
protoErr chan error
|
||||||
closed chan struct{}
|
closed chan struct{}
|
||||||
disc chan DiscReason
|
disc chan DiscReason
|
||||||
|
@ -102,58 +102,50 @@ func (p *Peer) String() string {
|
||||||
|
|
||||||
func newPeer(fd net.Conn, conn *conn, protocols []Protocol) *Peer {
|
func newPeer(fd net.Conn, conn *conn, protocols []Protocol) *Peer {
|
||||||
logtag := fmt.Sprintf("Peer %.8x %v", conn.ID[:], fd.RemoteAddr())
|
logtag := fmt.Sprintf("Peer %.8x %v", conn.ID[:], fd.RemoteAddr())
|
||||||
|
protomap := matchProtocols(protocols, conn.Caps, conn)
|
||||||
p := &Peer{
|
p := &Peer{
|
||||||
Logger: logger.NewLogger(logtag),
|
Logger: logger.NewLogger(logtag),
|
||||||
conn: fd,
|
conn: fd,
|
||||||
rw: conn,
|
rw: conn,
|
||||||
running: matchProtocols(protocols, conn.Caps, conn),
|
running: protomap,
|
||||||
disc: make(chan DiscReason),
|
disc: make(chan DiscReason),
|
||||||
protoErr: make(chan error),
|
protoErr: make(chan error, len(protomap)+1), // protocols + pingLoop
|
||||||
closed: make(chan struct{}),
|
closed: make(chan struct{}),
|
||||||
}
|
}
|
||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Peer) run() DiscReason {
|
func (p *Peer) run() DiscReason {
|
||||||
var readErr = make(chan error, 1)
|
readErr := make(chan error, 1)
|
||||||
defer p.closeProtocols()
|
p.wg.Add(2)
|
||||||
defer close(p.closed)
|
go p.readLoop(readErr)
|
||||||
|
go p.pingLoop()
|
||||||
|
|
||||||
p.startProtocols()
|
p.startProtocols()
|
||||||
go func() { readErr <- p.readLoop() }()
|
|
||||||
|
|
||||||
ping := time.NewTicker(pingInterval)
|
|
||||||
defer ping.Stop()
|
|
||||||
|
|
||||||
// Wait for an error or disconnect.
|
// Wait for an error or disconnect.
|
||||||
var reason DiscReason
|
var reason DiscReason
|
||||||
loop:
|
select {
|
||||||
for {
|
case err := <-readErr:
|
||||||
select {
|
if r, ok := err.(DiscReason); ok {
|
||||||
case <-ping.C:
|
reason = r
|
||||||
go func() {
|
break
|
||||||
if err := SendItems(p.rw, pingMsg); err != nil {
|
|
||||||
p.protoErr <- err
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
case err := <-readErr:
|
|
||||||
// We rely on protocols to abort if there is a write error. It
|
|
||||||
// might be more robust to handle them here as well.
|
|
||||||
p.DebugDetailf("Read error: %v\n", err)
|
|
||||||
p.conn.Close()
|
|
||||||
return DiscNetworkError
|
|
||||||
case err := <-p.protoErr:
|
|
||||||
reason = discReasonForError(err)
|
|
||||||
break loop
|
|
||||||
case reason = <-p.disc:
|
|
||||||
break loop
|
|
||||||
}
|
}
|
||||||
|
// Note: We rely on protocols to abort if there is a write
|
||||||
|
// error. It might be more robust to handle them here as well.
|
||||||
|
p.DebugDetailf("Read error: %v\n", err)
|
||||||
|
p.conn.Close()
|
||||||
|
reason = DiscNetworkError
|
||||||
|
case err := <-p.protoErr:
|
||||||
|
reason = discReasonForError(err)
|
||||||
|
case reason = <-p.disc:
|
||||||
}
|
}
|
||||||
p.politeDisconnect(reason)
|
|
||||||
|
|
||||||
// Wait for readLoop. It will end because conn is now closed.
|
close(p.closed)
|
||||||
<-readErr
|
p.wg.Wait()
|
||||||
|
if reason != DiscNetworkError {
|
||||||
|
p.politeDisconnect(reason)
|
||||||
|
}
|
||||||
p.Debugf("Disconnected: %v\n", reason)
|
p.Debugf("Disconnected: %v\n", reason)
|
||||||
return reason
|
return reason
|
||||||
}
|
}
|
||||||
|
@ -174,18 +166,37 @@ func (p *Peer) politeDisconnect(reason DiscReason) {
|
||||||
p.conn.Close()
|
p.conn.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Peer) readLoop() error {
|
func (p *Peer) pingLoop() {
|
||||||
|
ping := time.NewTicker(pingInterval)
|
||||||
|
defer p.wg.Done()
|
||||||
|
defer ping.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ping.C:
|
||||||
|
if err := SendItems(p.rw, pingMsg); err != nil {
|
||||||
|
p.protoErr <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case <-p.closed:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Peer) readLoop(errc chan<- error) {
|
||||||
|
defer p.wg.Done()
|
||||||
for {
|
for {
|
||||||
p.conn.SetDeadline(time.Now().Add(frameReadTimeout))
|
p.conn.SetDeadline(time.Now().Add(frameReadTimeout))
|
||||||
msg, err := p.rw.ReadMsg()
|
msg, err := p.rw.ReadMsg()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
errc <- err
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if err = p.handle(msg); err != nil {
|
if err = p.handle(msg); err != nil {
|
||||||
return err
|
errc <- err
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Peer) handle(msg Msg) error {
|
func (p *Peer) handle(msg Msg) error {
|
||||||
|
@ -195,12 +206,11 @@ func (p *Peer) handle(msg Msg) error {
|
||||||
go SendItems(p.rw, pongMsg)
|
go SendItems(p.rw, pongMsg)
|
||||||
case msg.Code == discMsg:
|
case msg.Code == discMsg:
|
||||||
var reason [1]DiscReason
|
var reason [1]DiscReason
|
||||||
// no need to discard or for error checking, we'll close the
|
// This is the last message. We don't need to discard or
|
||||||
// connection after this.
|
// check errors because, the connection will be closed after it.
|
||||||
rlp.Decode(msg.Payload, &reason)
|
rlp.Decode(msg.Payload, &reason)
|
||||||
p.Debugf("Disconnect requested: %v\n", reason[0])
|
p.Debugf("Disconnect requested: %v\n", reason[0])
|
||||||
p.Disconnect(DiscRequested)
|
return DiscRequested
|
||||||
return discRequestedError(reason[0])
|
|
||||||
case msg.Code < baseProtocolLength:
|
case msg.Code < baseProtocolLength:
|
||||||
// ignore other base protocol messages
|
// ignore other base protocol messages
|
||||||
return msg.Discard()
|
return msg.Discard()
|
||||||
|
@ -210,7 +220,12 @@ func (p *Peer) handle(msg Msg) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("msg code out of range: %v", msg.Code)
|
return fmt.Errorf("msg code out of range: %v", msg.Code)
|
||||||
}
|
}
|
||||||
proto.in <- msg
|
select {
|
||||||
|
case proto.in <- msg:
|
||||||
|
return nil
|
||||||
|
case <-p.closed:
|
||||||
|
return io.EOF
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -234,10 +249,11 @@ outer:
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Peer) startProtocols() {
|
func (p *Peer) startProtocols() {
|
||||||
|
p.wg.Add(len(p.running))
|
||||||
for _, proto := range p.running {
|
for _, proto := range p.running {
|
||||||
proto := proto
|
proto := proto
|
||||||
|
proto.closed = p.closed
|
||||||
p.DebugDetailf("Starting protocol %s/%d\n", proto.Name, proto.Version)
|
p.DebugDetailf("Starting protocol %s/%d\n", proto.Name, proto.Version)
|
||||||
p.protoWG.Add(1)
|
|
||||||
go func() {
|
go func() {
|
||||||
err := proto.Run(p, proto)
|
err := proto.Run(p, proto)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
@ -246,11 +262,8 @@ func (p *Peer) startProtocols() {
|
||||||
} else {
|
} else {
|
||||||
p.DebugDetailf("Protocol %s/%d error: %v\n", proto.Name, proto.Version, err)
|
p.DebugDetailf("Protocol %s/%d error: %v\n", proto.Name, proto.Version, err)
|
||||||
}
|
}
|
||||||
select {
|
p.protoErr <- err
|
||||||
case p.protoErr <- err:
|
p.wg.Done()
|
||||||
case <-p.closed:
|
|
||||||
}
|
|
||||||
p.protoWG.Done()
|
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -266,13 +279,6 @@ func (p *Peer) getProto(code uint64) (*protoRW, error) {
|
||||||
return nil, newPeerError(errInvalidMsgCode, "%d", code)
|
return nil, newPeerError(errInvalidMsgCode, "%d", code)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Peer) closeProtocols() {
|
|
||||||
for _, p := range p.running {
|
|
||||||
close(p.in)
|
|
||||||
}
|
|
||||||
p.protoWG.Wait()
|
|
||||||
}
|
|
||||||
|
|
||||||
// writeProtoMsg sends the given message on behalf of the given named protocol.
|
// writeProtoMsg sends the given message on behalf of the given named protocol.
|
||||||
// this exists because of Server.Broadcast.
|
// this exists because of Server.Broadcast.
|
||||||
func (p *Peer) writeProtoMsg(protoName string, msg Msg) error {
|
func (p *Peer) writeProtoMsg(protoName string, msg Msg) error {
|
||||||
|
@ -289,8 +295,8 @@ func (p *Peer) writeProtoMsg(protoName string, msg Msg) error {
|
||||||
|
|
||||||
type protoRW struct {
|
type protoRW struct {
|
||||||
Protocol
|
Protocol
|
||||||
|
|
||||||
in chan Msg
|
in chan Msg
|
||||||
|
closed <-chan struct{}
|
||||||
offset uint64
|
offset uint64
|
||||||
w MsgWriter
|
w MsgWriter
|
||||||
}
|
}
|
||||||
|
@ -304,10 +310,11 @@ func (rw *protoRW) WriteMsg(msg Msg) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (rw *protoRW) ReadMsg() (Msg, error) {
|
func (rw *protoRW) ReadMsg() (Msg, error) {
|
||||||
msg, ok := <-rw.in
|
select {
|
||||||
if !ok {
|
case msg := <-rw.in:
|
||||||
return msg, io.EOF
|
msg.Code -= rw.offset
|
||||||
|
return msg, nil
|
||||||
|
case <-rw.closed:
|
||||||
|
return Msg{}, io.EOF
|
||||||
}
|
}
|
||||||
msg.Code -= rw.offset
|
|
||||||
return msg, nil
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -98,15 +98,13 @@ func (d DiscReason) String() string {
|
||||||
return discReasonToString[d]
|
return discReasonToString[d]
|
||||||
}
|
}
|
||||||
|
|
||||||
type discRequestedError DiscReason
|
func (d DiscReason) Error() string {
|
||||||
|
return d.String()
|
||||||
func (err discRequestedError) Error() string {
|
|
||||||
return fmt.Sprintf("disconnect requested: %v", DiscReason(err))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func discReasonForError(err error) DiscReason {
|
func discReasonForError(err error) DiscReason {
|
||||||
if reason, ok := err.(discRequestedError); ok {
|
if reason, ok := err.(DiscReason); ok {
|
||||||
return DiscReason(reason)
|
return reason
|
||||||
}
|
}
|
||||||
peerError, ok := err.(*peerError)
|
peerError, ok := err.(*peerError)
|
||||||
if !ok {
|
if !ok {
|
||||||
|
|
|
@ -2,8 +2,9 @@ package p2p
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"math/rand"
|
||||||
"net"
|
"net"
|
||||||
"reflect"
|
"reflect"
|
||||||
"testing"
|
"testing"
|
||||||
|
@ -27,7 +28,7 @@ var discard = Protocol{
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func testPeer(protos []Protocol) (io.Closer, *conn, *Peer, <-chan DiscReason) {
|
func testPeer(protos []Protocol) (func(), *conn, *Peer, <-chan DiscReason) {
|
||||||
fd1, _ := net.Pipe()
|
fd1, _ := net.Pipe()
|
||||||
hs1 := &protoHandshake{ID: randomID(), Version: baseProtocolVersion}
|
hs1 := &protoHandshake{ID: randomID(), Version: baseProtocolVersion}
|
||||||
hs2 := &protoHandshake{ID: randomID(), Version: baseProtocolVersion}
|
hs2 := &protoHandshake{ID: randomID(), Version: baseProtocolVersion}
|
||||||
|
@ -41,7 +42,11 @@ func testPeer(protos []Protocol) (io.Closer, *conn, *Peer, <-chan DiscReason) {
|
||||||
errc := make(chan DiscReason, 1)
|
errc := make(chan DiscReason, 1)
|
||||||
go func() { errc <- peer.run() }()
|
go func() { errc <- peer.run() }()
|
||||||
|
|
||||||
return p1, &conn{p2, hs2}, peer, errc
|
closer := func() {
|
||||||
|
p1.Close()
|
||||||
|
fd1.Close()
|
||||||
|
}
|
||||||
|
return closer, &conn{p2, hs2}, peer, errc
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPeerProtoReadMsg(t *testing.T) {
|
func TestPeerProtoReadMsg(t *testing.T) {
|
||||||
|
@ -67,7 +72,7 @@ func TestPeerProtoReadMsg(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
closer, rw, _, errc := testPeer([]Protocol{proto})
|
closer, rw, _, errc := testPeer([]Protocol{proto})
|
||||||
defer closer.Close()
|
defer closer()
|
||||||
|
|
||||||
Send(rw, baseProtocolLength+2, []uint{1})
|
Send(rw, baseProtocolLength+2, []uint{1})
|
||||||
Send(rw, baseProtocolLength+3, []uint{2})
|
Send(rw, baseProtocolLength+3, []uint{2})
|
||||||
|
@ -99,7 +104,7 @@ func TestPeerProtoEncodeMsg(t *testing.T) {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
closer, rw, _, _ := testPeer([]Protocol{proto})
|
closer, rw, _, _ := testPeer([]Protocol{proto})
|
||||||
defer closer.Close()
|
defer closer()
|
||||||
|
|
||||||
if err := ExpectMsg(rw, 17, []string{"foo", "bar"}); err != nil {
|
if err := ExpectMsg(rw, 17, []string{"foo", "bar"}); err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
|
@ -110,7 +115,7 @@ func TestPeerWriteForBroadcast(t *testing.T) {
|
||||||
defer testlog(t).detach()
|
defer testlog(t).detach()
|
||||||
|
|
||||||
closer, rw, peer, peerErr := testPeer([]Protocol{discard})
|
closer, rw, peer, peerErr := testPeer([]Protocol{discard})
|
||||||
defer closer.Close()
|
defer closer()
|
||||||
|
|
||||||
emptymsg := func(code uint64) Msg {
|
emptymsg := func(code uint64) Msg {
|
||||||
return Msg{Code: code, Size: 0, Payload: bytes.NewReader(nil)}
|
return Msg{Code: code, Size: 0, Payload: bytes.NewReader(nil)}
|
||||||
|
@ -150,7 +155,7 @@ func TestPeerPing(t *testing.T) {
|
||||||
defer testlog(t).detach()
|
defer testlog(t).detach()
|
||||||
|
|
||||||
closer, rw, _, _ := testPeer(nil)
|
closer, rw, _, _ := testPeer(nil)
|
||||||
defer closer.Close()
|
defer closer()
|
||||||
if err := SendItems(rw, pingMsg); err != nil {
|
if err := SendItems(rw, pingMsg); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
@ -163,19 +168,70 @@ func TestPeerDisconnect(t *testing.T) {
|
||||||
defer testlog(t).detach()
|
defer testlog(t).detach()
|
||||||
|
|
||||||
closer, rw, _, disc := testPeer(nil)
|
closer, rw, _, disc := testPeer(nil)
|
||||||
defer closer.Close()
|
defer closer()
|
||||||
if err := SendItems(rw, discMsg, DiscQuitting); err != nil {
|
if err := SendItems(rw, discMsg, DiscQuitting); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if err := ExpectMsg(rw, discMsg, []interface{}{DiscRequested}); err != nil {
|
if err := ExpectMsg(rw, discMsg, []interface{}{DiscRequested}); err != nil {
|
||||||
t.Error(err)
|
t.Error(err)
|
||||||
}
|
}
|
||||||
closer.Close() // make test end faster
|
closer()
|
||||||
if reason := <-disc; reason != DiscRequested {
|
if reason := <-disc; reason != DiscRequested {
|
||||||
t.Errorf("run returned wrong reason: got %v, want %v", reason, DiscRequested)
|
t.Errorf("run returned wrong reason: got %v, want %v", reason, DiscRequested)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This test is supposed to verify that Peer can reliably handle
|
||||||
|
// multiple causes of disconnection occurring at the same time.
|
||||||
|
func TestPeerDisconnectRace(t *testing.T) {
|
||||||
|
defer testlog(t).detach()
|
||||||
|
maybe := func() bool { return rand.Intn(1) == 1 }
|
||||||
|
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
protoclose := make(chan error)
|
||||||
|
protodisc := make(chan DiscReason)
|
||||||
|
closer, rw, p, disc := testPeer([]Protocol{
|
||||||
|
{
|
||||||
|
Name: "closereq",
|
||||||
|
Run: func(p *Peer, rw MsgReadWriter) error { return <-protoclose },
|
||||||
|
Length: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "disconnect",
|
||||||
|
Run: func(p *Peer, rw MsgReadWriter) error { p.Disconnect(<-protodisc); return nil },
|
||||||
|
Length: 1,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
// Simulate incoming messages.
|
||||||
|
go SendItems(rw, baseProtocolLength+1)
|
||||||
|
go SendItems(rw, baseProtocolLength+2)
|
||||||
|
// Close the network connection.
|
||||||
|
go closer()
|
||||||
|
// Make protocol "closereq" return.
|
||||||
|
protoclose <- errors.New("protocol closed")
|
||||||
|
// Make protocol "disconnect" call peer.Disconnect
|
||||||
|
protodisc <- DiscAlreadyConnected
|
||||||
|
// In some cases, simulate something else calling peer.Disconnect.
|
||||||
|
if maybe() {
|
||||||
|
go p.Disconnect(DiscInvalidIdentity)
|
||||||
|
}
|
||||||
|
// In some cases, simulate remote requesting a disconnect.
|
||||||
|
if maybe() {
|
||||||
|
go SendItems(rw, discMsg, DiscQuitting)
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-disc:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
// Peer.run should return quickly. If it doesn't the Peer
|
||||||
|
// goroutines are probably deadlocked. Call panic in order to
|
||||||
|
// show the stacks.
|
||||||
|
panic("Peer.run took to long to return.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNewPeer(t *testing.T) {
|
func TestNewPeer(t *testing.T) {
|
||||||
name := "nodename"
|
name := "nodename"
|
||||||
caps := []Cap{{"foo", 2}, {"bar", 3}}
|
caps := []Cap{{"foo", 2}, {"bar", 3}}
|
||||||
|
|
Loading…
Reference in New Issue