From 57d4898e2992a46fc2deab93a2666a2979b6704c Mon Sep 17 00:00:00 2001
From: Felix Lange
Date: Mon, 17 Feb 2020 15:23:25 +0100
Subject: [PATCH] p2p/dnsdisc: re-check tree root when leaf resolution fails (#20682)

This adds logic to re-resolve the root name of a tree after several leaf
requests have failed. The change is needed to avoid getting stuck in a
failure state where leaf requests keep failing for up to half an hour
(the default recheck interval) after the tree has been updated.
---
 p2p/dnsdisc/client_test.go | 66 ++++++++++++++++++++++++++++++++------
 p2p/dnsdisc/sync.go        | 65 +++++++++++++++++++++++++++++++------
 2 files changed, 113 insertions(+), 18 deletions(-)

diff --git a/p2p/dnsdisc/client_test.go b/p2p/dnsdisc/client_test.go
index 1a1d5ade95..6a6705abf2 100644
--- a/p2p/dnsdisc/client_test.go
+++ b/p2p/dnsdisc/client_test.go
@@ -19,6 +19,7 @@ package dnsdisc
 import (
 	"context"
 	"crypto/ecdsa"
+	"errors"
 	"math/rand"
 	"reflect"
 	"testing"
@@ -176,11 +177,62 @@ func TestIteratorNodeUpdates(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// sync the original tree.
+	// Sync the original tree.
 	resolver.add(tree1.ToTXT("n"))
 	checkIterator(t, it, nodes[:25])
 
-	// Update some nodes and ensure RandomNode returns the new nodes as well.
+	// Ensure RandomNode returns the new nodes after the tree is updated.
+	updateSomeNodes(nodesSeed1, nodes)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.clear()
+	resolver.add(tree2.ToTXT("n"))
+	t.Log("tree updated")
+
+	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+	checkIterator(t, it, nodes)
+}
+
+// This test checks that the tree root is rechecked when a couple of leaf
+// requests have failed. The test is just like TestIteratorNodeUpdates, but
+// without advancing the clock by recheckInterval after the tree update.
+func TestIteratorRootRecheckOnFail(t *testing.T) {
+	var (
+		clock    = new(mclock.Simulated)
+		nodes    = testNodes(nodesSeed1, 30)
+		resolver = newMapResolver()
+		c        = NewClient(Config{
+			Resolver:        resolver,
+			Logger:          testlog.Logger(t, log.LvlTrace),
+			RecheckInterval: 20 * time.Minute,
+			RateLimit:       500,
+			// Disabling the cache is required for this test because the client doesn't
+			// notice leaf failures if all records are cached.
+			CacheLimit: 1,
+		})
+	)
+	c.clock = clock
+	tree1, url := makeTestTree("n", nodes[:25], nil)
+	it, err := c.NewIterator(url)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Sync the original tree.
+	resolver.add(tree1.ToTXT("n"))
+	checkIterator(t, it, nodes[:25])
+
+	// Ensure RandomNode returns the new nodes after the tree is updated.
+	updateSomeNodes(nodesSeed1, nodes)
+	tree2, _ := makeTestTree("n", nodes, nil)
+	resolver.clear()
+	resolver.add(tree2.ToTXT("n"))
+	t.Log("tree updated")
+
+	checkIterator(t, it, nodes)
+}
+
+// updateSomeNodes applies ENR updates to some of the given nodes.
+func updateSomeNodes(keySeed int64, nodes []*enode.Node) {
 	keys := testKeys(nodesSeed1, len(nodes))
 	for i, n := range nodes[:len(nodes)/2] {
 		r := n.Record()
@@ -190,11 +242,6 @@ func TestIteratorNodeUpdates(t *testing.T) {
 		n2, _ := enode.New(enode.ValidSchemes, r)
 		nodes[i] = n2
 	}
-	tree2, _ := makeTestTree("n", nodes, nil)
-	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
-	resolver.clear()
-	resolver.add(tree2.ToTXT("n"))
-	checkIterator(t, it, nodes)
 }
 
 // This test verifies that randomIterator re-checks the root of the tree to catch
@@ -230,9 +277,10 @@ func TestIteratorLinkUpdates(t *testing.T) {
 	// Add link to tree3, remove link to tree2.
 	tree1, _ = makeTestTree("t1", nodes[:10], []string{url3})
 	resolver.add(tree1.ToTXT("t1"))
-	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
 	t.Log("tree1 updated")
 
+	clock.Run(c.cfg.RecheckInterval + 1*time.Second)
+
 	var wantNodes []*enode.Node
 	wantNodes = append(wantNodes, tree1.Nodes()...)
 	wantNodes = append(wantNodes, tree3.Nodes()...)
@@ -345,5 +393,5 @@ func (mr mapResolver) LookupTXT(ctx context.Context, name string) ([]string, err
 	if record, ok := mr[name]; ok {
 		return []string{record}, nil
 	}
-	return nil, nil
+	return nil, errors.New("not found")
 }
diff --git a/p2p/dnsdisc/sync.go b/p2p/dnsdisc/sync.go
index 53423527a5..36f02acba6 100644
--- a/p2p/dnsdisc/sync.go
+++ b/p2p/dnsdisc/sync.go
@@ -25,15 +25,22 @@ import (
 	"github.com/ethereum/go-ethereum/p2p/enode"
 )
 
+const (
+	rootRecheckFailCount = 5 // update root if this many leaf requests fail
+)
+
 // clientTree is a full tree being synced.
 type clientTree struct {
 	c   *Client
 	loc *linkEntry // link to this tree
 
 	lastRootCheck mclock.AbsTime // last revalidation of root
-	root          *rootEntry
-	enrs          *subtreeSync
-	links         *subtreeSync
+	leafFailCount int
+	rootFailCount int
+
+	root  *rootEntry
+	enrs  *subtreeSync
+	links *subtreeSync
 
 	lc       *linkCache          // tracks all links between all trees
 	curLinks map[string]struct{} // links contained in this tree
@@ -46,7 +53,7 @@ func newClientTree(c *Client, lc *linkCache, loc *linkEntry) *clientTree {
 
 // syncAll retrieves all entries of the tree.
 func (ct *clientTree) syncAll(dest map[string]entry) error {
-	if err := ct.updateRoot(); err != nil {
+	if err := ct.updateRoot(context.Background()); err != nil {
 		return err
 	}
 	if err := ct.links.resolveAll(dest); err != nil {
@@ -60,12 +67,20 @@ func (ct *clientTree) syncAll(dest map[string]entry) error {
 
 // syncRandom retrieves a single entry of the tree. The Node return value
 // is non-nil if the entry was a node.
-func (ct *clientTree) syncRandom(ctx context.Context) (*enode.Node, error) {
+func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) {
 	if ct.rootUpdateDue() {
-		if err := ct.updateRoot(); err != nil {
+		if err := ct.updateRoot(ctx); err != nil {
 			return nil, err
 		}
 	}
+
+	// Update fail counter for leaf request errors.
+	defer func() {
+		if err != nil {
+			ct.leafFailCount++
+		}
+	}()
+
 	// Link tree sync has priority, run it to completion before syncing ENRs.
 	if !ct.links.done() {
 		err := ct.syncNextLink(ctx)
@@ -138,15 +153,22 @@ func removeHash(h []string, index int) []string {
 }
 
 // updateRoot ensures that the given tree has an up-to-date root.
-func (ct *clientTree) updateRoot() error {
+func (ct *clientTree) updateRoot(ctx context.Context) error {
+	if !ct.slowdownRootUpdate(ctx) {
+		return ctx.Err()
+	}
+
 	ct.lastRootCheck = ct.c.clock.Now()
-	ctx, cancel := context.WithTimeout(context.Background(), ct.c.cfg.Timeout)
+	ctx, cancel := context.WithTimeout(ctx, ct.c.cfg.Timeout)
 	defer cancel()
 	root, err := ct.c.resolveRoot(ctx, ct.loc)
 	if err != nil {
+		ct.rootFailCount++
 		return err
 	}
 	ct.root = &root
+	ct.rootFailCount = 0
+	ct.leafFailCount = 0
 
 	// Invalidate subtrees if changed.
 	if ct.links == nil || root.lroot != ct.links.root {
@@ -161,7 +183,32 @@ func (ct *clientTree) updateRoot() error {
 
 // rootUpdateDue returns true when a root update is needed.
 func (ct *clientTree) rootUpdateDue() bool {
-	return ct.root == nil || time.Duration(ct.c.clock.Now()-ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	tooManyFailures := ct.leafFailCount > rootRecheckFailCount
+	scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval
+	return ct.root == nil || tooManyFailures || scheduledCheck
+}
+
+// slowdownRootUpdate applies a delay to root resolution if it is tried
+// too frequently. This avoids busy polling when the client is offline.
+// Returns true if the timeout passed, false if sync was canceled.
+func (ct *clientTree) slowdownRootUpdate(ctx context.Context) bool {
+	var delay time.Duration
+	switch {
+	case ct.rootFailCount > 20:
+		delay = 10 * time.Second
+	case ct.rootFailCount > 5:
+		delay = 5 * time.Second
+	default:
+		return true
+	}
+	timeout := ct.c.clock.NewTimer(delay)
+	defer timeout.Stop()
+	select {
+	case <-timeout.C():
+		return true
+	case <-ctx.Done():
+		return false
+	}
 }
 
 // subtreeSync is the sync of an ENR or link subtree.
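
Note (not part of the patch): a minimal usage sketch of the client whose
leaf-failure handling changes above, using only the public dnsdisc API seen
in this patch (NewClient, Config, NewIterator returning an enode.Iterator).
The enrtree URL below is a placeholder and must be replaced with a real
signed tree URL for the iterator to return nodes.

package main

import (
	"fmt"

	"github.com/ethereum/go-ethereum/p2p/dnsdisc"
	"github.com/ethereum/go-ethereum/p2p/enode"
)

func main() {
	// Placeholder URL (assumption): a real enrtree:// URL encodes the tree
	// signer's public key and the DNS name the tree is published under.
	const treeURL = "enrtree://<signer-public-key>@nodes.example.org"

	// Zero-value Config fields fall back to the client defaults, including
	// the half-hour RecheckInterval mentioned in the commit message.
	client := dnsdisc.NewClient(dnsdisc.Config{})

	it, err := client.NewIterator(treeURL)
	if err != nil {
		fmt.Println("invalid tree URL:", err)
		return
	}
	defer it.Close()

	// Next blocks until a node is available. With this patch, a handful of
	// failed leaf lookups (rootRecheckFailCount = 5) triggers an early root
	// re-resolution instead of waiting out the full recheck interval.
	var nodes []*enode.Node
	for len(nodes) < 5 && it.Next() {
		nodes = append(nodes, it.Node())
	}
	fmt.Println("discovered", len(nodes), "nodes")
}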